From f74b756db6a5856f3db86a32322a3101de9b2f3e Mon Sep 17 00:00:00 2001
From: ChaoticByte <contact@chaoticbyte.net>
Date: Fri, 16 Aug 2024 20:18:47 +0200
Subject: [PATCH] Clarify that only English summarization is supported at the
 moment, pin it in the code

---
 README.md          | 18 ++++++++++--------
 audio-summarize.py | 15 +++++++--------
 2 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/README.md b/README.md
index 7bedd55..84420b1 100644
--- a/README.md
+++ b/README.md
@@ -2,6 +2,10 @@
 
 An audio summarizer that glues together [faster-whisper](https://github.com/SYSTRAN/faster-whisper) and [BART](https://huggingface.co/facebook/bart-large-cnn).
 
+## Supported Languages
+
+Only English audio is supported at the moment: transcription is pinned to English, and summarization only works for English text.
+
 ## Dependencies
 
 - Python 3 (tested: 3.12)
@@ -24,17 +28,15 @@ pip3 install -r requirements.txt
 ### Usage
 
 ```
-./audio-summarize.py -i filepath -o filepath
-                     [--summin n] [--summax n] [--segmax n]
-                     [--lang lang] [-m name]
+./audio-summarize.py -i filepath -o filepath [-m name]
+                     [--summin n] [--summax n] [--segmax n]
 
 options:
   -h, --help   show this help message and exit
-  --summin n   The minimum lenght of a segment summary [10, min: 5]
-  --summax n   The maximum lenght of a segment summary [90, min: 5]
-  --segmax n   The maximum number of tokens per segment [375, 5 - 500]
-  --lang lang  The language of the audio source ['en']
-  -m name      The name of the whisper model to be used ['small.en']
+  --summin n   The minimum lenght of a segment summary [10] (min: 5)
+  --summax n   The maximum lenght of a segment summary [90] (min: 5)
+  --segmax n   The maximum number of tokens per segment [375] (5 - 500)
+  -m name      The name of the whisper model to be used [small.en]
   -i filepath  The path to the media file
   -o filepath  Where to save the output text to
 ```
diff --git a/audio-summarize.py b/audio-summarize.py
index 4bbaaae..d7680a7 100755
--- a/audio-summarize.py
+++ b/audio-summarize.py
@@ -20,12 +20,12 @@ from transformers import pipeline
 
 # Transcription
 
-def transcribe(model_name: str, audio_file: str, language: str) -> str:
+def transcribe(model_name: str, audio_file: str) -> str:
     '''Transcribe the media using faster-whisper'''
     t_chunks = []
     print("* Loading model ", end="", flush=True)
     model = WhisperModel(model_name, device="auto", compute_type="int8")
-    segments, _ = model.transcribe(audio_file, language=language, beam_size=5, condition_on_previous_text=False)
+    segments, _ = model.transcribe(audio_file, language="en", beam_size=5, condition_on_previous_text=False)
     print()
     print("* Transcribing audio ", end="", flush=True)
     for s in segments:
@@ -67,11 +67,10 @@ def summarize(chunks: List[str], summary_min: int, summary_max: int) -> str:
 if __name__ == "__main__":
     # parse commandline arguments
     argp = ArgumentParser()
-    argp.add_argument("--summin", metavar="n", type=int, default=10, help="The minimum lenght of a segment summary [10, min: 5]")
-    argp.add_argument("--summax", metavar="n", type=int, default=90, help="The maximum lenght of a segment summary [90, min: 5]")
-    argp.add_argument("--segmax", metavar="n", type=int, default=375, help="The maximum number of tokens per segment [375, 5 - 500]")
-    argp.add_argument("--lang", metavar="lang", type=str, default="en", help="The language of the audio source ['en']")
-    argp.add_argument("-m", metavar="name", type=str, default="small.en", help="The name of the whisper model to be used ['small.en']")
+    argp.add_argument("--summin", metavar="n", type=int, default=10, help="The minimum lenght of a segment summary [10] (min: 5)")
+    argp.add_argument("--summax", metavar="n", type=int, default=90, help="The maximum lenght of a segment summary [90] (min: 5)")
+    argp.add_argument("--segmax", metavar="n", type=int, default=375, help="The maximum number of tokens per segment [375] (5 - 500)")
+    argp.add_argument("-m", metavar="name", type=str, default="small.en", help="The name of the whisper model to be used [small.en]")
     argp.add_argument("-i", required=True, metavar="filepath", type=Path, help="The path to the media file")
     argp.add_argument("-o", required=True, metavar="filepath", type=Path, help="Where to save the output text to")
     args = argp.parse_args()
@@ -80,7 +79,7 @@ if __name__ == "__main__":
     args.summax = max(5, args.summax)
     args.segmax = max(5, min(args.segmax, 500))
     # transcribe
-    text = transcribe(args.m, args.i, args.lang).strip()
+    text = transcribe(args.m, args.i).strip()
     # split up into semantic segments & summarize
     chunks = split_text(text, args.segmax)
     summary = summarize(chunks, args.summin, args.summax)