audio-summarize/audio-summarize.py

#!/usr/bin/env python3
"""Summarize a media file: convert it with ffmpeg, transcribe it with
whisper.cpp and summarize the transcript with a BART summarization model."""
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from argparse import ArgumentParser
from pathlib import Path
from subprocess import check_call, DEVNULL
from tempfile import TemporaryDirectory
from typing import List

from semantic_text_splitter import TextSplitter
from tokenizers import Tokenizer
from transformers import pipeline

# Hugging Face model used both for tokenizing (chunking) and summarizing
NLP_MODEL = "facebook/bart-large-cnn"

root_dir = Path(__file__).parent
whisper_cpp_binary = str(root_dir / "vendor" / "whisper.cpp" / "main")


# tasks

def convert_audio(media_file: str, output_file: str):
    # Convert the input media to 16 kHz, 16-bit mono WAV, the format whisper.cpp expects
    check_call([
        "ffmpeg",
        "-hide_banner",
        "-loglevel", "error",
        "-i", media_file,
        "-ac", "1",
        "-ar", "16000",
        "-c:a", "pcm_s16le",
        output_file])

def transcribe(model_file: str, audio_file: str, output_file: str):
    # Run whisper.cpp on the converted audio and write a plain-text transcript
    check_call([
        whisper_cpp_binary,
        "-m", model_file,
        "--max-context", "64",
        "--beam-size", "5",
        "--no-prints",
        "--no-timestamps",
        "--output-txt",
        "--output-file", output_file[:-4],  # strip '.txt' file ending
        audio_file], stdout=DEVNULL)

def cleanup_text(t: str) -> str:
    # Flatten the transcript into a single line of text
    t = t.replace("\n", "")
    t = t.replace("\r", "")
    t = t.strip()
    return t

def split_text(t: str, max_tokens: int) -> List[str]:
    # Split the transcript into chunks of roughly 80-100% of max_tokens,
    # measured with the summarization model's own tokenizer
    tokenizer = Tokenizer.from_pretrained(NLP_MODEL)
    splitter = TextSplitter.from_huggingface_tokenizer(
        tokenizer, (int(max_tokens * 0.8), max_tokens))
    chunks = splitter.chunks(t)
    return chunks

def summarize(chunks: List[str], summary_min: int, summary_max: int) -> str:
    # Summarize each chunk independently and join the partial summaries
    chunks_summarized = []
    summ = pipeline("summarization", model=NLP_MODEL)
    for c in chunks:
        chunks_summarized.append(
            summ(c, max_length=summary_max, min_length=summary_min,
                 do_sample=False)[0]['summary_text'].strip())
    return "\n".join(chunks_summarized)

#

if __name__ == "__main__":
    argp = ArgumentParser()
    argp.add_argument("--summin", metavar="n", type=int, default=10, help="The minimum length of a segment summary [10]")
    argp.add_argument("--summax", metavar="n", type=int, default=90, help="The maximum length of a segment summary [90]")
    argp.add_argument("--segmax", metavar="n", type=int, default=375, help="The maximum number of tokens per segment [375, max: 500]")
argp.add_argument("-m", required=True, metavar="filepath", type=Path, help="The path to a whisper.cpp-compatible model file")
argp.add_argument("-i", required=True, metavar="filepath", type=Path, help="The path to the media file")
argp.add_argument("-o", required=True, metavar="filepath", type=Path, help="Where to save the output text to")
args = argp.parse_args()
args.segmax = min(args.segmax, 500)
# create tmpdir
with TemporaryDirectory(suffix="as") as d:
converted_audio_path = (Path(d) / "audio.wav").__str__()
transcript_path = (Path(d) / "transcript.txt").__str__()
# convert using ffmpeg
print("* Converting media to 16kHz 16bit mono WAV")
convert_audio(args.i.__str__(), converted_audio_path)
# transcribe
print("* Transcribing audio")
transcribe(args.m.__str__(), converted_audio_path, transcript_path)
# read transcript
text = Path(transcript_path).read_text()
# cleanup text & summarize
print("* Summarizing transcript")
text = cleanup_text(text)
chunks = split_text(text, args.segmax)
summary = summarize(chunks, args.summin, args.summax)
print(f"\n{summary}\n")
print(f"* Saving summary to {args.o.__str__()}")
with args.o.open("w+") as f:
f.write(summary)
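
# Example invocation (file names are hypothetical; any whisper.cpp-compatible
# model file should work):
#
#   python3 audio-summarize.py \
#       -m models/ggml-base.en.bin \
#       -i recording.mp3 \
#       -o summary.txt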