#!/usr/bin/env python3
# Copyright (c) 2024 Julian Müller (ChaoticByte)
# Disable FutureWarnings
2024-08-13 20:32:46 +02:00
import warnings
warnings . simplefilter ( action = ' ignore ' , category = FutureWarning )
# Imports
from argparse import ArgumentParser
from pathlib import Path
from typing import List

from faster_whisper import WhisperModel
from semantic_text_splitter import TextSplitter
from tokenizers import Tokenizer
from transformers import pipeline
# Transcription
def transcribe(model_name: str, audio_file: str, language: str = "en") -> str:
    '''Transcribe the media using faster-whisper.

    model_name: name of the whisper model to load (e.g. "small.en")
    audio_file: path to the audio/media file to transcribe
    language: ISO 639-1 language code forwarded to the model; defaults to
        "en" (generalized from the previously hard-coded value, so existing
        callers are unaffected)

    Returns the concatenated transcript text.
    '''
    t_chunks = []
    print("* Loading model", end="", flush=True)
    # int8 keeps the memory footprint low; "auto" selects GPU when available
    model = WhisperModel(model_name, device="auto", compute_type="int8")
    # condition_on_previous_text=False reduces repetition loops on long audio
    segments, _ = model.transcribe(audio_file, language=language, beam_size=5, condition_on_previous_text=False)
    print()
    print("* Transcribing audio", end="", flush=True)
    for s in segments:
        print(".", end="", flush=True)  # progress indicator, one dot per segment
        t_chunks.append(s.text)
    print()
    # NOTE(review): segment texts carry their own leading whitespace, so they
    # are joined without a separator — confirm against faster-whisper output
    return "".join(t_chunks)
# NLP
# Hugging Face model id used for both tokenization (segment sizing) and
# summarization; the mangled literal with embedded spaces was not a valid id.
NLP_MODEL = "facebook/bart-large-cnn"
def split_text(t: str, max_tokens: int) -> List[str]:
    '''Split text into semantic segments.

    Segments are sized between 80% of max_tokens and max_tokens, measured
    with the summarization model's own tokenizer.
    '''
    print("* Splitting up transcript into semantic segments")
    capacity = (int(max_tokens * 0.8), max_tokens)
    splitter = TextSplitter.from_huggingface_tokenizer(
        Tokenizer.from_pretrained(NLP_MODEL),
        capacity,
    )
    return splitter.chunks(t)
def summarize(chunks: List[str], summary_min: int, summary_max: int) -> str:
    '''Summarize all segments (chunks) using a language model.

    Each chunk is summarized independently; the per-chunk summaries are
    returned joined by newlines.
    '''
    print("* Summarizing transcript segments", end="", flush=True)
    summarizer = pipeline("summarization", model=NLP_MODEL)
    summaries = []
    for chunk in chunks:
        print(".", end="", flush=True)  # progress dot per chunk
        # do_sample=False keeps the output deterministic
        result = summarizer(chunk, max_length=summary_max, min_length=summary_min, do_sample=False)
        summaries.append(result[0]['summary_text'].strip())
    print()
    return "\n".join(summaries)
# Main
if __name__ == "__main__":
    # parse commandline arguments
    argp = ArgumentParser()
    # fixed "lenght" -> "length" typos in the user-facing help strings
    argp.add_argument("--summin", metavar="n", type=int, default=10, help="The minimum length of a segment summary [10] (min: 5)")
    argp.add_argument("--summax", metavar="n", type=int, default=90, help="The maximum length of a segment summary [90] (min: 5)")
    argp.add_argument("--segmax", metavar="n", type=int, default=375, help="The maximum number of tokens per segment [375] (5 - 500)")
    argp.add_argument("-m", metavar="name", type=str, default="small.en", help="The name of the whisper model to be used [small.en]")
    argp.add_argument("-i", required=True, metavar="filepath", type=Path, help="The path to the media file")
    argp.add_argument("-o", required=True, metavar="filepath", type=Path, help="Where to save the output text to")
    args = argp.parse_args()
    # Clamp values
    args.summin = max(5, args.summin)
    args.summax = max(5, args.summax)
    # Fix: guarantee summax >= summin; otherwise the summarization pipeline
    # is called with min_length > max_length
    args.summax = max(args.summax, args.summin)
    args.segmax = max(5, min(args.segmax, 500))
    # transcribe (pass the path as str, matching transcribe()'s signature)
    text = transcribe(args.m, str(args.i)).strip()
    # split up into semantic segments & summarize
    chunks = split_text(text, args.segmax)
    summary = summarize(chunks, args.summin, args.summax)
    print(f"\n{summary}\n")
    print(f"* Saving summary to {args.o}")
    with args.o.open("w+") as f:  # overwrites
        f.write(summary)