Clarify that only English summarization is supported at the moment, and pin it in the code

This commit is contained in:
ChaoticByte 2024-08-16 20:18:47 +02:00
parent f83043921a
commit f74b756db6
No known key found for this signature in database
2 changed files with 17 additions and 16 deletions

View file

@ -2,6 +2,10 @@
An audio summarizer that glues together [faster-whisper](https://github.com/SYSTRAN/faster-whisper) and [BART](https://huggingface.co/facebook/bart-large-cnn). An audio summarizer that glues together [faster-whisper](https://github.com/SYSTRAN/faster-whisper) and [BART](https://huggingface.co/facebook/bart-large-cnn).
## Supported Languages
Only English summarization is supported.
## Dependencies ## Dependencies
- Python 3 (tested: 3.12) - Python 3 (tested: 3.12)
@ -24,17 +28,15 @@ pip3 install -r requirements.txt
### Usage ### Usage
``` ```
./audio-summarize.py -i filepath -o filepath ./audio-summarize.py -i filepath -o filepath [-m name]
[--summin n] [--summax n] [--segmax n] [--summin n] [--summax n] [--segmax n]
[--lang lang] [-m name]
options: options:
-h, --help show this help message and exit -h, --help show this help message and exit
--summin n The minimum length of a segment summary [10, min: 5] --summin n The minimum length of a segment summary [10] (min: 5)
--summax n The maximum length of a segment summary [90, min: 5] --summax n The maximum length of a segment summary [90] (min: 5)
--segmax n The maximum number of tokens per segment [375, 5 - 500] --segmax n The maximum number of tokens per segment [375] (5 - 500)
--lang lang The language of the audio source ['en'] -m name The name of the whisper model to be used [small.en]
-m name The name of the whisper model to be used ['small.en']
-i filepath The path to the media file -i filepath The path to the media file
-o filepath Where to save the output text to -o filepath Where to save the output text to
``` ```

View file

@ -20,12 +20,12 @@ from transformers import pipeline
# Transcription # Transcription
def transcribe(model_name: str, audio_file: str, language: str) -> str: def transcribe(model_name: str, audio_file: str) -> str:
'''Transcribe the media using faster-whisper''' '''Transcribe the media using faster-whisper'''
t_chunks = [] t_chunks = []
print("* Loading model ", end="", flush=True) print("* Loading model ", end="", flush=True)
model = WhisperModel(model_name, device="auto", compute_type="int8") model = WhisperModel(model_name, device="auto", compute_type="int8")
segments, _ = model.transcribe(audio_file, language=language, beam_size=5, condition_on_previous_text=False) segments, _ = model.transcribe(audio_file, language="en", beam_size=5, condition_on_previous_text=False)
print() print()
print("* Transcribing audio ", end="", flush=True) print("* Transcribing audio ", end="", flush=True)
for s in segments: for s in segments:
@ -67,11 +67,10 @@ def summarize(chunks: List[str], summary_min: int, summary_max: int) -> str:
if __name__ == "__main__": if __name__ == "__main__":
# parse commandline arguments # parse commandline arguments
argp = ArgumentParser() argp = ArgumentParser()
argp.add_argument("--summin", metavar="n", type=int, default=10, help="The minimum lenght of a segment summary [10, min: 5]") argp.add_argument("--summin", metavar="n", type=int, default=10, help="The minimum lenght of a segment summary [10] (min: 5)")
argp.add_argument("--summax", metavar="n", type=int, default=90, help="The maximum lenght of a segment summary [90, min: 5]") argp.add_argument("--summax", metavar="n", type=int, default=90, help="The maximum lenght of a segment summary [90] (min: 5)")
argp.add_argument("--segmax", metavar="n", type=int, default=375, help="The maximum number of tokens per segment [375, 5 - 500]") argp.add_argument("--segmax", metavar="n", type=int, default=375, help="The maximum number of tokens per segment [375] (5 - 500)")
argp.add_argument("--lang", metavar="lang", type=str, default="en", help="The language of the audio source ['en']") argp.add_argument("-m", metavar="name", type=str, default="small.en", help="The name of the whisper model to be used [small.en]")
argp.add_argument("-m", metavar="name", type=str, default="small.en", help="The name of the whisper model to be used ['small.en']")
argp.add_argument("-i", required=True, metavar="filepath", type=Path, help="The path to the media file") argp.add_argument("-i", required=True, metavar="filepath", type=Path, help="The path to the media file")
argp.add_argument("-o", required=True, metavar="filepath", type=Path, help="Where to save the output text to") argp.add_argument("-o", required=True, metavar="filepath", type=Path, help="Where to save the output text to")
args = argp.parse_args() args = argp.parse_args()
@ -80,7 +79,7 @@ if __name__ == "__main__":
args.summax = max(5, args.summax) args.summax = max(5, args.summax)
args.segmax = max(5, min(args.segmax, 500)) args.segmax = max(5, min(args.segmax, 500))
# transcribe # transcribe
text = transcribe(args.m, args.i, args.lang).strip() text = transcribe(args.m, args.i).strip()
# split up into semantic segments & summarize # split up into semantic segments & summarize
chunks = split_text(text, args.segmax) chunks = split_text(text, args.segmax)
summary = summarize(chunks, args.summin, args.summax) summary = summarize(chunks, args.summin, args.summax)