#!/usr/bin/env python3
# Copyright (c) 2024 Julian Müller (ChaoticByte)
# Disable FutureWarnings
2024-08-13 20:32:46 +02:00
import warnings
warnings . simplefilter ( action = ' ignore ' , category = FutureWarning )
# Imports
from argparse import ArgumentParser
from pathlib import Path
from typing import List

from faster_whisper import WhisperModel
from semantic_text_splitter import TextSplitter
from tokenizers import Tokenizer
from transformers import pipeline
# Transcription
def transcribe(model_name: str, audio_file: str, language: str = "en") -> str:
    '''Transcribe the media using faster-whisper.

    model_name: name of the whisper model to load (e.g. "small.en")
    audio_file: path to the audio/media file to transcribe
    language: ISO 639-1 language code forwarded to the model; defaults to
        "en" (generalized from the previously hard-coded value, so existing
        callers are unaffected)

    Returns the concatenated transcript text.
    '''
    t_chunks = []
    print("* Loading model", end="", flush=True)
    # int8 keeps the memory footprint low; "auto" selects GPU when available
    model = WhisperModel(model_name, device="auto", compute_type="int8")
    # condition_on_previous_text=False reduces repetition loops on long audio
    segments, _ = model.transcribe(audio_file, language=language, beam_size=5, condition_on_previous_text=False)
    print()
    print("* Transcribing audio", end="", flush=True)
    for s in segments:
        print(".", end="", flush=True)  # progress indicator, one dot per segment
        t_chunks.append(s.text)
    print()
    # NOTE(review): segment texts carry their own leading whitespace, so they
    # are joined without a separator — confirm against faster-whisper output
    return "".join(t_chunks)
# NLP
# Hugging Face model id used for both tokenization (segment sizing) and
# summarization; the mangled literal with embedded spaces was not a valid id.
NLP_MODEL = "facebook/bart-large-cnn"
def split_text(t: str, max_tokens: int) -> List[str]:
    '''Split text into semantic segments.

    Segments are sized between 80% of max_tokens and max_tokens, measured
    with the summarization model's own tokenizer.
    '''
    print("* Splitting up transcript into semantic segments")
    capacity = (int(max_tokens * 0.8), max_tokens)
    splitter = TextSplitter.from_huggingface_tokenizer(
        Tokenizer.from_pretrained(NLP_MODEL),
        capacity,
    )
    return splitter.chunks(t)
def summarize(chunks: List[str], summary_min: int, summary_max: int) -> str:
    '''Summarize all segments (chunks) using a language model.

    Each chunk is summarized independently; the per-chunk summaries are
    returned joined by newlines.
    '''
    print("* Summarizing transcript segments", end="", flush=True)
    summarizer = pipeline("summarization", model=NLP_MODEL)
    summaries = []
    for chunk in chunks:
        print(".", end="", flush=True)  # progress dot per chunk
        # do_sample=False keeps the output deterministic
        result = summarizer(chunk, max_length=summary_max, min_length=summary_min, do_sample=False)
        summaries.append(result[0]['summary_text'].strip())
    print()
    return "\n".join(summaries)
# Main
if __name__ == "__main__":
    # parse commandline arguments
    argp = ArgumentParser()
    # fixed "lenght" -> "length" typos in the user-facing help strings
    argp.add_argument("--summin", metavar="n", type=int, default=10, help="The minimum length of a segment summary [10] (min: 5)")
    argp.add_argument("--summax", metavar="n", type=int, default=90, help="The maximum length of a segment summary [90] (min: 5)")
    argp.add_argument("--segmax", metavar="n", type=int, default=375, help="The maximum number of tokens per segment [375] (5 - 500)")
    argp.add_argument("-m", metavar="name", type=str, default="small.en", help="The name of the whisper model to be used [small.en]")
    argp.add_argument("-i", required=True, metavar="filepath", type=Path, help="The path to the media file")
    argp.add_argument("-o", required=True, metavar="filepath", type=Path, help="Where to save the output text to")
    args = argp.parse_args()
    # Clamp values
    args.summin = max(5, args.summin)
    args.summax = max(5, args.summax)
    # Fix: guarantee summax >= summin; otherwise the summarization pipeline
    # is called with min_length > max_length
    args.summax = max(args.summax, args.summin)
    args.segmax = max(5, min(args.segmax, 500))
    # transcribe (pass the path as str, matching transcribe()'s signature)
    text = transcribe(args.m, str(args.i)).strip()
    # split up into semantic segments & summarize
    chunks = split_text(text, args.segmax)
    summary = summarize(chunks, args.summin, args.summax)
    print(f"\n{summary}\n")
    print(f"* Saving summary to {args.o}")
    with args.o.open("w+") as f:  # overwrites
        f.write(summary)