Switch from whisper.cpp to faster-whisper

parent 464ede2444
commit f83043921a

5 changed files with 47 additions and 112 deletions
.gitignore (1 change)

@@ -1,4 +1,3 @@
 tmp/
-vendor/*
 .venv/
 .vscode/
README.md (40 changes)

@@ -1,48 +1,40 @@
 # audio-summarize
 
-An audio summarizer that glues together ffmpeg, whisper.cpp and BART.
+An audio summarizer that glues together [faster-whisper](https://github.com/SYSTRAN/faster-whisper) and [BART](https://huggingface.co/facebook/bart-large-cnn).
 
 ## Dependencies
 
 - Python 3 (tested: 3.12)
-- ffmpeg
-- git
-- make
-- c/c++ compiler (on Ubuntu, installing `build-essential` does the trick)
 
 ## Setup
 
-Create a virtual environment for python and activate it:
+Create a virtual environment for python, activate it and install the required python packages:
 
 ```bash
 python3 -m venv .venv
 source .venv/bin/activate
-```
-
-Run setup.sh
-
-```bash
-./setup.sh
+pip3 install -r requirements.txt
 ```
 
 ## Run
 
-1. You need a whisper.cpp compatible model file (-> https://huggingface.co/ggerganov/whisper.cpp)
-2. In your terminal, make sure you have your python venv activated
-3. Run audio-summarize.py
+1. In your terminal, make sure you have your python venv activated
+2. Run audio-summarize.py
 
 ### Usage
 
 ```
-./audio-summarize.py -m filepath -i filepath -o filepath
+./audio-summarize.py -i filepath -o filepath
                      [--summin n] [--summax n] [--segmax n]
+                     [--lang lang] [-m name]
 
 options:
   -h, --help   show this help message and exit
   --summin n   The minimum length of a segment summary [10, min: 5]
   --summax n   The maximum length of a segment summary [90, min: 5]
   --segmax n   The maximum number of tokens per segment [375, 5 - 500]
-  -m filepath  The path to a whisper.cpp-compatible model file
+  --lang lang  The language of the audio source ['en']
+  -m name      The name of the whisper model to be used ['small.en']
  -i filepath  The path to the media file
  -o filepath  Where to save the output text to
 ```

@@ -50,16 +42,14 @@ options:
 Example:
 
 ```bash
-./audio-summarize.py -m ./tmp/whisper_ggml-small.en-q5_1.bin -i ./tmp/test.webm -o ./tmp/output.txt
+./audio-summarize.py -i ./tmp/test.webm -o ./tmp/output.txt
 ```
 
 ## How does it work?
 
 To summarize a media file, the program executes the following steps:
 
-1. Convert the media file with [ffmpeg](https://www.ffmpeg.org/) to a mono 16kHz 16bit-PCM wav file
-2. Transcribe that wav file using [whisper.cpp](https://github.com/ggerganov/whisper.cpp)
-3. Clean up the transcript (newlines, whitespaces at the beginning and end)
-4. Semantically split up the transcript into segments using [semantic-text-splitter](https://github.com/benbrandt/text-splitter) and the tokenizer for BART
-5. Summarize each segment using BART ([`facebook/bart-large-cnn`](https://huggingface.co/facebook/bart-large-cnn))
-6. Write the results to a text file
+1. Convert and transcribe the media file using [faster-whisper](https://github.com/SYSTRAN/faster-whisper), with [ffmpeg](https://www.ffmpeg.org/) and [ctranslate2](https://github.com/OpenNMT/CTranslate2/) under the hood
+2. Semantically split up the transcript into segments using [semantic-text-splitter](https://github.com/benbrandt/text-splitter) and the tokenizer for BART
+3. Summarize each segment using BART ([`facebook/bart-large-cnn`](https://huggingface.co/facebook/bart-large-cnn))
+4. Write the results to a text file
audio-summarize.py

@@ -10,55 +10,39 @@ warnings.simplefilter(action='ignore', category=FutureWarning)
 
 from argparse import ArgumentParser
 from pathlib import Path
-from subprocess import check_call, DEVNULL
-from tempfile import TemporaryDirectory
 from typing import List
 
+from faster_whisper import WhisperModel
 from semantic_text_splitter import TextSplitter
 from tokenizers import Tokenizer
 from transformers import pipeline
 
-# Some constant variables
+# Transcription
+
+
+def transcribe(model_name: str, audio_file: str, language: str) -> str:
+    '''Transcribe the media using faster-whisper'''
+    t_chunks = []
+    print("* Loading model ", end="", flush=True)
+    model = WhisperModel(model_name, device="auto", compute_type="int8")
+    segments, _ = model.transcribe(audio_file, language=language, beam_size=5, condition_on_previous_text=False)
+    print()
+    print("* Transcribing audio ", end="", flush=True)
+    for s in segments:
+        print(".", end="", flush=True)
+        t_chunks.append(s.text)
+    print()
+    t = "".join(t_chunks)
+    return t
+
+
+# NLP
 
 NLP_MODEL = "facebook/bart-large-cnn"
-root_dir = Path(__file__).parent
-whisper_cpp_binary = (root_dir / "vendor" / "whisper.cpp" / "main").__str__()
-
-
-# Steps
-
-
-def convert_audio(media_file: str, output_file: str):
-    '''Convert media to mono 16kHz pcm_s16le wav using ffmpeg'''
-    check_call([
-        "ffmpeg",
-        "-hide_banner",
-        "-loglevel", "error",
-        "-i", media_file,
-        "-ac", "1",
-        "-ar", "16000",
-        "-c:a", "pcm_s16le",
-        output_file])
-
-
-def transcribe(model_file: str, audio_file: str, output_file: str):
-    '''Transcribe audio file using whisper.cpp'''
-    check_call([
-        whisper_cpp_binary,
-        "-m", model_file,
-        "--max-context", "64",
-        "--beam-size", "5",
-        "--no-prints",
-        "--no-timestamps",
-        "--output-txt",
-        "--output-file", output_file[:-4],  # strip '.txt' file ending
-        audio_file], stdout=DEVNULL)
-
-
-def cleanup_text(t: str) -> str:
-    t = t.replace("\n", "")
-    t = t.replace("\r", "")
-    t = t.strip()
-    return t
-
 
 def split_text(t: str, max_tokens: int) -> List[str]:
     '''Split text into semantic segments'''
+    print("* Splitting up transcript into semantic segments")
     tokenizer = Tokenizer.from_pretrained(NLP_MODEL)
     splitter = TextSplitter.from_huggingface_tokenizer(
         tokenizer, (int(max_tokens*0.8), max_tokens))
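One detail of `split_text` worth spelling out: the second argument to `TextSplitter.from_huggingface_tokenizer` is a `(min, max)` capacity range, so with the default `--segmax 375` the splitter aims for semantically coherent chunks of roughly 300 to 375 BART tokens instead of hard-packing every chunk to the limit. A standalone sketch (the input string is made up):

```python
from semantic_text_splitter import TextSplitter
from tokenizers import Tokenizer

# Same tokenizer the script uses, so chunk sizes are measured in
# BART tokens rather than characters.
tokenizer = Tokenizer.from_pretrained("facebook/bart-large-cnn")

# (min, max) capacity: a chunk may end at a good semantic boundary
# once it reaches 300 tokens, and must never exceed 375 tokens.
splitter = TextSplitter.from_huggingface_tokenizer(tokenizer, (300, 375))

transcript = "First sentence of a long transcript. " * 200  # made-up input
for chunk in splitter.chunks(transcript):
    print(len(chunk), "characters")
```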
@@ -67,13 +51,17 @@ def split_text(t: str, max_tokens: int) -> List[str]:
 
 def summarize(chunks: List[str], summary_min: int, summary_max: int) -> str:
     '''Summarize all segments (chunks) using a language model'''
+    print("* Summarizing transcript segments ", end="", flush=True)
     chunks_summarized = []
     summ = pipeline("summarization", model=NLP_MODEL)
     for c in chunks:
+        print(".", end="", flush=True)
         chunks_summarized.append(
             summ(c, max_length=summary_max, min_length=summary_min, do_sample=False)[0]['summary_text'].strip())
+    print()
     return "\n".join(chunks_summarized)
 
 
 # Main
 
 if __name__ == "__main__":
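The summarization side is untouched apart from the progress output; it is the standard Hugging Face transformers pipeline API, where `do_sample=False` makes decoding deterministic, so re-running on the same transcript yields the same summary. A self-contained sketch with a made-up input text:

```python
from transformers import pipeline

# Build the summarizer once and reuse it for every chunk; constructing
# a pipeline loads the full BART model into memory.
summ = pipeline("summarization", model="facebook/bart-large-cnn")

text = (
    "The speaker introduces the project, walks through the transcription "
    "pipeline, and compares the accuracy of several whisper model sizes "
    "before closing with a short question-and-answer session."
)
result = summ(text, max_length=90, min_length=10, do_sample=False)
print(result[0]["summary_text"].strip())
```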
@@ -82,7 +70,8 @@ if __name__ == "__main__":
     argp.add_argument("--summin", metavar="n", type=int, default=10, help="The minimum length of a segment summary [10, min: 5]")
     argp.add_argument("--summax", metavar="n", type=int, default=90, help="The maximum length of a segment summary [90, min: 5]")
     argp.add_argument("--segmax", metavar="n", type=int, default=375, help="The maximum number of tokens per segment [375, 5 - 500]")
-    argp.add_argument("-m", required=True, metavar="filepath", type=Path, help="The path to a whisper.cpp-compatible model file")
+    argp.add_argument("--lang", metavar="lang", type=str, default="en", help="The language of the audio source ['en']")
+    argp.add_argument("-m", metavar="name", type=str, default="small.en", help="The name of the whisper model to be used ['small.en']")
     argp.add_argument("-i", required=True, metavar="filepath", type=Path, help="The path to the media file")
     argp.add_argument("-o", required=True, metavar="filepath", type=Path, help="Where to save the output text to")
     args = argp.parse_args()
@@ -90,21 +79,9 @@ if __name__ == "__main__":
     args.summin = max(5, args.summin)
     args.summax = max(5, args.summax)
     args.segmax = max(5, min(args.segmax, 500))
-    # create tmpdir
-    with TemporaryDirectory(suffix="as") as d:
-        converted_audio_path = (Path(d) / "audio.wav").__str__()
-        transcript_path = (Path(d) / "transcript.txt").__str__()
-        # convert using ffmpeg
-        print("* Converting media to the correct format ...")
-        convert_audio(args.i.__str__(), converted_audio_path)
-        # transcribe
-        print("* Transcribing audio ...")
-        transcribe(args.m.__str__(), converted_audio_path, transcript_path)
-        # read transcript
-        text = Path(transcript_path).read_text()
-        # cleanup text & summarize
-        print("* Summarizing transcript ...")
-        text = cleanup_text(text)
+    # transcribe
+    text = transcribe(args.m, args.i, args.lang).strip()
+    # split up into semantic segments & summarize
     chunks = split_text(text, args.segmax)
     summary = summarize(chunks, args.summin, args.summax)
     print(f"\n{summary}\n")
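A behavioral detail behind the progress dots in the new `transcribe()`: faster-whisper's `transcribe()` returns a lazy generator, so the call itself is cheap and the actual decoding only happens while the segment loop iterates. A small sketch (hypothetical file name) showing per-segment progress with timestamps:

```python
from faster_whisper import WhisperModel

model = WhisperModel("small.en", device="auto", compute_type="int8")

# No audio has been processed yet when this call returns; `segments`
# is a generator that decodes on demand.
segments, info = model.transcribe("talk.webm", language="en")

# The real work happens during iteration, which is why the script can
# print a dot for each segment as it is produced.
for segment in segments:
    print(f"[{segment.start:7.2f}s -> {segment.end:7.2f}s] {segment.text}")
```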
requirements.txt

@@ -1,3 +1,4 @@
+faster-whisper
 semantic-text-splitter
 torch
 transformers
setup.sh (32 changes)

@@ -1,32 +0,0 @@
-#!/usr/bin/env bash
-
-# init
-oldcwd=$(pwd)
-function cleanup {
-    cd ${oldcwd}
-}
-trap cleanup EXIT
-
-export root_dir=$(realpath $(dirname $0))
-export vendor_dir=${root_dir}/vendor
-
-# Prepare installation of dependencies
-
-mkdir -p ${vendor_dir}
-cd ${vendor_dir}
-
-# Install whisper.cpp
-
-if [ ! -d ./whisper.cpp ]; then
-    git clone -b v1.6.2 https://github.com/ggerganov/whisper.cpp.git
-fi
-cd whisper.cpp
-make
-cd ${vendor_dir}
-
-# Install python packages
-
-if ! python3 -m pip install -r "${root_dir}/requirements.txt"; then
-    echo
-    echo "Make sure to run this script in a python virtual environment!"
-fi