So I asked Mistral to make a script; the objective was to produce a transcript for a video, based on the mp4 file. It proceeded to make one, with a ton of libraries to handle everything: one to get the audio, one to transcribe it, etc.
This meant that the scope was too wide, so I dialed it back enough to simplify the problem. First I split the vocals into an mp3 myself, and then asked it to only make a template srt from that mp3.
This script still requires one library, PyDub. I then asked it to add progress information, but it failed: the progress only shows up at the very end.
# pip install pydub
import sys
from pydub import AudioSegment
from pydub.silence import detect_silence
def generate_srt(time_intervals, output_file):
    """
    Write a placeholder SRT subtitle file, one cue per time interval.

    Cues are numbered from 1. Each cue carries its start/end timestamps and
    the literal text "TODO", meant to be replaced with the real subtitle.
    A progress counter is printed to stdout while writing.

    :param time_intervals: List of (start, end) tuples in milliseconds
    :param output_file: Path of the SRT file to create (opened in 'w' mode,
        so an existing file is overwritten)
    """
    cue_count = len(time_intervals)
    with open(output_file, 'w') as handle:
        for cue_number, interval in enumerate(time_intervals, start=1):
            begin_ms, finish_ms = interval
            timing_line = f"{format_time(begin_ms)} --> {format_time(finish_ms)}"
            # One cue = index line, timing line, text line, blank separator.
            handle.write(f"{cue_number}\n{timing_line}\nTODO\n\n")
            # '\r' returns the cursor so the counter is redrawn on one line.
            print(f"Writing SRT: {cue_number}/{cue_count} intervals completed", end='\r', flush=True)
    print("\nSRT file generation complete.")
def format_time(milliseconds):
    """
    Render a millisecond offset as an SRT timestamp (hh:mm:ss,mmm).

    :param milliseconds: Time offset in milliseconds
    :return: Timestamp string with zero-padded fields, e.g. "01:01:01,001"
    """
    # Peel off the smallest unit first and carry the rest upwards.
    whole_seconds, ms_part = divmod(milliseconds, 1000)
    whole_minutes, sec_part = divmod(whole_seconds, 60)
    hour_part, min_part = divmod(whole_minutes, 60)
    return f"{hour_part:02}:{min_part:02}:{sec_part:02},{ms_part:03}"
def detect_clips(audio_file_path):
    """
    Split an MP3 into non-silent clips by locating the silent stretches.

    Silence is detected with pydub using a minimum length of 500 and a
    threshold of -40; everything between two silences becomes one clip.

    :param audio_file_path: Path to the MP3 audio file
    :return: List of (start, end) tuples in milliseconds
    """
    audio = AudioSegment.from_mp3(audio_file_path)
    # NOTE(review): presumably this single call dominates the runtime, which
    # would explain why the progress counter below only appears at the end.
    silences = detect_silence(audio, min_silence_len=500, silence_thresh=-40)
    silence_count = len(silences)

    clips = []
    cursor = 0  # where the current (not yet closed) clip begins
    for done, (silence_start, silence_end) in enumerate(silences, start=1):
        # Skip zero-length clips, e.g. when the audio opens with silence.
        if silence_start > cursor:
            clips.append((cursor, silence_start))
        cursor = silence_end
        print(f"Detecting silence: {done}/{silence_count} intervals completed", end='\r', flush=True)

    # Whatever follows the final silence is the trailing clip.
    audio_length = len(audio)
    if cursor < audio_length:
        clips.append((cursor, audio_length))
    print("\nSilence detection complete.")
    return clips
def main(audio_file_path):
    """
    Create a placeholder SRT file next to the given audio file.

    The output path is the audio path with its extension replaced by
    ".srt" (or with ".srt" appended when the file name has no extension).

    :param audio_file_path: Path to the audio file to analyse
    """
    # Local import keeps this function self-contained.
    import os

    # Detect clips in the audio file
    time_intervals = detect_clips(audio_file_path)

    # Derive the output SRT file path from the audio file path.
    # os.path.splitext only treats a dot in the final path component as an
    # extension separator, so "my.dir/audio" or "./audio" are not truncated
    # the way a plain rsplit('.', 1) would truncate them.
    base_path, _extension = os.path.splitext(audio_file_path)
    output_file = base_path + '.srt'

    # Generate the SRT file
    generate_srt(time_intervals, output_file)
    print(f"SRT file generated: {output_file}")
if __name__ == "__main__":
    # Exactly one command-line argument is expected: the audio file path.
    if len(sys.argv) == 2:
        audio_file_path = sys.argv[1]
        main(audio_file_path)
    else:
        print("Usage: python generate_srt.py <audio_file_path>")
        sys.exit(1)
What do you think? It’s a handy script if you are “GPU poor”.