#!/usr/bin/env python3
# Copyright (c) 2024 Julian Müller (ChaoticByte)
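#
# Usage examples (file names are hypothetical):
#   python3 audio-summarize.py -i recording.mp3 -o summary.txt
#   python3 audio-summarize.py -i recording.mp3 -o summary.txt -m small.en --lang en --segmax 375
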
# Disable FutureWarnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
# Imports
from argparse import ArgumentParser
from pathlib import Path
from typing import List
from faster_whisper import WhisperModel
from semantic_text_splitter import TextSplitter
from tokenizers import Tokenizer
from transformers import pipeline
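
# faster-whisper provides the speech-to-text model, semantic-text-splitter and
# tokenizers split the transcript into token-bounded segments, and transformers
# provides the summarization pipeline.
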
# Transcription
def transcribe(model_name: str, audio_file: str, language: str) -> str:
    '''Transcribe the media using faster-whisper'''
    t_chunks = []
    print("* Loading model ", end="", flush=True)
    model = WhisperModel(model_name, device="auto", compute_type="int8")
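    # beam_size=5 enables beam search; condition_on_previous_text=False keeps a
    # transcription error in one segment from being carried into the next one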
    segments, _ = model.transcribe(audio_file, language=language, beam_size=5, condition_on_previous_text=False)
    print()
    print("* Transcribing audio ", end="", flush=True)
    for s in segments:
        print(".", end="", flush=True)
        t_chunks.append(s.text)
    print()
    t = "".join(t_chunks)
    return t

# NLP
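# facebook/bart-large-cnn is a BART model fine-tuned for summarization; its
# tokenizer is also reused below to measure segment sizes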
NLP_MODEL = "facebook/bart-large-cnn"
def split_text(t: str, max_tokens: int) -> List[str]:
    '''Split text into semantic segments'''
    print("* Splitting up transcript into semantic segments")
    tokenizer = Tokenizer.from_pretrained(NLP_MODEL)
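    # the splitter aims for chunks between 80% and 100% of max_tokens,
    # measured with the summarization model's tokenizer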
    splitter = TextSplitter.from_huggingface_tokenizer(
        tokenizer, (int(max_tokens*0.8), max_tokens))
    chunks = splitter.chunks(t)
    return chunks

def summarize(chunks: List[str], summary_min: int, summary_max: int) -> str:
    '''Summarize all segments (chunks) using a language model'''
    print("* Summarizing transcript segments ", end="", flush=True)
    chunks_summarized = []
    summ = pipeline("summarization", model=NLP_MODEL)
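    # each segment is summarized independently; do_sample=False keeps the output
    # deterministic, and min/max lengths are given in tokens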
    for c in chunks:
        print(".", end="", flush=True)
        chunks_summarized.append(
            summ(c, max_length=summary_max, min_length=summary_min, do_sample=False)[0]['summary_text'].strip())
    print()
    return "\n".join(chunks_summarized)

# Main
if __name__ == "__main__":
    # parse commandline arguments
    argp = ArgumentParser()
    argp.add_argument("--summin", metavar="n", type=int, default=10, help="The minimum length of a segment summary [10, min: 5]")
    argp.add_argument("--summax", metavar="n", type=int, default=90, help="The maximum length of a segment summary [90, min: 5]")
    argp.add_argument("--segmax", metavar="n", type=int, default=375, help="The maximum number of tokens per segment [375, 5 - 500]")
    argp.add_argument("--lang", metavar="lang", type=str, default="en", help="The language of the audio source ['en']")
    argp.add_argument("-m", metavar="name", type=str, default="small.en", help="The name of the whisper model to be used ['small.en']")
    argp.add_argument("-i", required=True, metavar="filepath", type=Path, help="The path to the media file")
    argp.add_argument("-o", required=True, metavar="filepath", type=Path, help="Where to save the output text to")
    args = argp.parse_args()
    # Clamp values
    args.summin = max(5, args.summin)
    args.summax = max(5, args.summax)
    args.segmax = max(5, min(args.segmax, 500))
    # transcribe
    text = transcribe(args.m, args.i, args.lang).strip()
    # split up into semantic segments & summarize
    chunks = split_text(text, args.segmax)
    summary = summarize(chunks, args.summin, args.summax)
    print(f"\n{summary}\n")
    print(f"* Saving summary to {args.o.__str__()}")
    with args.o.open("w+") as f: # overwrites
        f.write(summary)