So I asked Mistral to make a script; the objective was to make a transcript for a video, based on the mp4 file. It proceeded to make one, with a ton of libraries to handle everything: one to get the audio, one to transcribe it, etc.

This meant that the scope was too wide, so I dialed it back enough to simplify the problem. First I split the vocals into an mp3 myself and asked it to only make a template srt from an mp3.

This script still requires one library, PyDub. I then asked it to add progress information, but it failed, only showing the progress at the end.

# pip install pydub

import sys
from pydub import AudioSegment
from pydub.silence import detect_silence

def generate_srt(time_intervals, output_file):
    """
    Write a placeholder SRT file, one numbered cue per time interval.

    Every cue gets the literal text "TODO" as its subtitle body, and a
    carriage-return progress line is printed after each cue is written.

    :param time_intervals: List of (start, end) tuples in milliseconds
    :param output_file: Path to the SRT file to create (overwritten if present)
    """
    cue_count = len(time_intervals)
    with open(output_file, 'w') as out:
        for number, span in enumerate(time_intervals, start=1):
            begin, finish = span
            begin_stamp = format_time(begin)
            finish_stamp = format_time(finish)
            # One cue = sequence number, timing line, body, blank separator.
            out.writelines((
                f"{number}\n",
                f"{begin_stamp} --> {finish_stamp}\n",
                "TODO\n\n",
            ))
            print(f"Writing SRT: {number}/{cue_count} intervals completed", end='\r', flush=True)
    print("\nSRT file generation complete.")

def format_time(milliseconds):
    """
    Convert milliseconds to SRT time format (hh:mm:ss,mmm).

    Fractional input is rounded to the nearest whole millisecond before it
    is split into fields, so a float such as 1500.0 still yields a
    well-formed timestamp instead of fields like "0.0".

    :param milliseconds: Non-negative time in milliseconds (int or float)
    :return: Formatted time string
    :raises ValueError: If the (rounded) time is negative
    """
    # Round first so that e.g. 999.6 carries into the seconds field.
    total_ms = int(round(milliseconds))
    if total_ms < 0:
        raise ValueError(f"time must be non-negative, got {milliseconds!r}")

    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    seconds, millis = divmod(remainder, 1_000)
    return f"{hours:02}:{minutes:02}:{seconds:02},{millis:03}"

def detect_clips(audio_file_path, min_silence_len=500, silence_thresh=-40):
    """
    Detect clips in the audio file based on silence.

    The audio is split at every detected silence; each stretch between two
    silences (plus any audio before the first and after the last one)
    becomes one clip.

    :param audio_file_path: Path to the mp3 audio file
    :param min_silence_len: Value passed to pydub's detect_silence as
        min_silence_len (default 500)
    :param silence_thresh: Value passed to pydub's detect_silence as
        silence_thresh (default -40)
    :return: List of tuples containing start and end times in milliseconds
    """
    # Loading and silence detection both finish before the loop below prints
    # anything, so announce each stage up front; otherwise the first visible
    # output only appears once all of that work is already done.
    print("Loading audio...", flush=True)
    audio = AudioSegment.from_mp3(audio_file_path)
    audio_length = len(audio)

    print("Detecting silence...", flush=True)
    silence_intervals = detect_silence(
        audio,
        min_silence_len=min_silence_len,
        silence_thresh=silence_thresh,
    )

    clips = []
    start = 0
    total_silences = len(silence_intervals)
    for i, (silence_start, silence_end) in enumerate(silence_intervals, start=1):
        # Skip empty clips, e.g. when the audio opens with silence.
        if silence_start > start:
            clips.append((start, silence_start))
        # The next clip begins where this silence ends.
        start = silence_end
        print(f"Detecting silence: {i}/{total_silences} intervals completed", end='\r', flush=True)

    # Add the last clip if audio remains after the final silence
    if start < audio_length:
        clips.append((start, audio_length))

    print("\nSilence detection complete.")
    return clips

def main(audio_file_path):
    """
    Build a template SRT file next to the given audio file.

    The output path is the audio path with its extension replaced by
    ``.srt`` (or with ``.srt`` appended when the file name has no extension).

    :param audio_file_path: Path to the audio file to analyse
    """
    # Imported here so this function is self-contained.
    import os

    # Detect clips in the audio file
    time_intervals = detect_clips(audio_file_path)

    # Derive the output SRT file path from the audio file path.
    # splitext only looks at the final path component, so dots in directory
    # names or in a leading "../" are left alone (a plain rsplit('.') would
    # turn "../audio" into "..srt").
    base_path, _extension = os.path.splitext(audio_file_path)
    output_file = base_path + '.srt'

    # Generate the SRT file
    generate_srt(time_intervals, output_file)
    print(f"SRT file generated: {output_file}")

if __name__ == "__main__":
    # Exactly one argument is expected after the script name: the audio file.
    if len(sys.argv) == 2:
        main(sys.argv[1])
    else:
        print("Usage: python generate_srt.py <audio_file_path>")
        sys.exit(1)

What do you think? It’s a handy script if you are “GPU poor”.