Long-running; this version seems broken

This commit is contained in:
Grzegorz Matoga
2025-06-04 15:40:54 +02:00
parent a0eb802390
commit f78b2364bf
2 changed files with 1410 additions and 477 deletions

View File

@@ -9,13 +9,18 @@ from dotenv import load_dotenv # For loading .env file
# --- Configuration ---
load_dotenv() # Load environment variables from .env file
YOUR_MP4_FILE = "jeden-z-10-final.mp4" # Replace with your MP4 file name
YOUR_HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN") # Ensure this is set in your .env file
# For OpenAI-Whisper: "tiny", "base", "small", "medium", "large", "large-v2", "large-v3"
# Larger models are more accurate but slower. "small" or "medium" are good next steps for Polish.
OPENAI_WHISPER_MODEL_SIZE = "large-v3" # Changed from "base" to "small" for better accuracy
# For FFmpeg audio extraction - set to None to process full audio
PROCESS_AUDIO_DURATION_SECONDS = None # User specified
YOUR_MP4_FILE = "jeden-z-10-final.mp4"
YOUR_HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
OPENAI_WHISPER_MODEL_SIZE = "large-v3" # Using the model that gave the best results
PROCESS_AUDIO_DURATION_SECONDS = None # Set to None for full audio, or e.g. 30 for testing
# Advanced Whisper transcription parameters for tuning
WHISPER_BEAM_SIZE = 5 # Default for non-English is 5. Larger can be more accurate but slower.
# Single low temperature for less randomness, potentially fewer hallucinations.
# Whisper default is a tuple: (0.0, 0.2, 0.4, 0.6, 0.8, 1.0)
WHISPER_TEMPERATURE = 0.0
WHISPER_PATIENCE = None # Default is 1.0. Higher values might help with repetition.
WHISPER_NO_SPEECH_THRESHOLD = 0.6 # Default. Lower might detect more faint speech.
# --- Paths ---
CURRENT_DIR = os.getcwd()
@@ -25,8 +30,8 @@ MP4_FILE_PATH = os.path.join(CURRENT_DIR, YOUR_MP4_FILE)
OUTPUT_TRANSCRIPT_FILE = "transcript_with_speakers.txt"
# --- Device Selection ---
PYANNOTE_DEVICE_TARGET = "cpu" # Default
WHISPER_DEVICE_TARGET = "cpu" # Default
PYANNOTE_DEVICE_TARGET = "cpu"
WHISPER_DEVICE_TARGET = "cpu"
if torch.backends.mps.is_available():
print("MPS device is available.")
@@ -98,11 +103,10 @@ def main():
print("Diarization complete.")
except Exception as e:
print(f"ERROR: Speaker diarization on {actual_pyannote_device} failed: {e}")
if actual_pyannote_device == "mps": # Fallback for Pyannote if MPS fails
if actual_pyannote_device == "mps":
print("Attempting diarization on CPU as a fallback...")
actual_pyannote_device = "cpu"
try:
# Re-initialize pipeline for CPU, as the original might be stuck on MPS
diarization_pipeline = Pipeline.from_pretrained(
"pyannote/speaker-diarization-3.1",
use_auth_token=YOUR_HUGGINGFACE_TOKEN
@@ -115,12 +119,11 @@ def main():
import traceback
print(traceback.format_exc())
return
else: # If it wasn't MPS or CPU fallback already failed
else:
import traceback
print(traceback.format_exc())
return
diarization_segments_for_df = []
for segment, _, speaker_label in diarization_result_annotation.itertracks(yield_label=True):
diarization_segments_for_df.append({
@@ -142,23 +145,33 @@ def main():
transcription_result_openai = None
model = None
transcribe_options = {
"language": "pl",
"fp16": (actual_whisper_device != "cpu"),
"word_timestamps": True,
"verbose": False,
"beam_size": WHISPER_BEAM_SIZE,
"patience": WHISPER_PATIENCE,
"no_speech_threshold": WHISPER_NO_SPEECH_THRESHOLD,
"temperature": WHISPER_TEMPERATURE
}
print(f"DEBUG: Whisper transcribe options: {transcribe_options}")
try:
use_fp16_whisper = (actual_whisper_device != "cpu")
model = whisper.load_model(OPENAI_WHISPER_MODEL_SIZE, device=actual_whisper_device)
# Explicitly set language to Polish
transcription_result_openai = model.transcribe(AUDIO_FILE_PATH, language="pl", fp16=use_fp16_whisper, word_timestamps=True, verbose=False)
transcription_result_openai = model.transcribe(AUDIO_FILE_PATH, **transcribe_options)
print(f"OpenAI-Whisper transcription on {actual_whisper_device} complete. Detected language (should be pl): {transcription_result_openai.get('language', 'N/A')}")
except NotImplementedError as nie:
print(f"ERROR: OpenAI-Whisper on {actual_whisper_device} failed with NotImplementedError: {nie}")
if actual_whisper_device == "mps":
print("Falling back to CPU for OpenAI-Whisper due to MPS NotImplementedError.")
actual_whisper_device = "cpu"
use_fp16_whisper = False
transcribe_options["fp16"] = False # CPU does not use fp16
print(f"DEBUG: Whisper transcribe options (CPU fallback): {transcribe_options}")
try:
model = whisper.load_model(OPENAI_WHISPER_MODEL_SIZE, device=actual_whisper_device)
# Explicitly set language to Polish for CPU fallback as well
transcription_result_openai = model.transcribe(AUDIO_FILE_PATH, language="pl", fp16=use_fp16_whisper, word_timestamps=True, verbose=False)
transcription_result_openai = model.transcribe(AUDIO_FILE_PATH, **transcribe_options)
print(f"OpenAI-Whisper transcription on CPU fallback complete. Detected language (should be pl): {transcription_result_openai.get('language', 'N/A')}")
except Exception as e_cpu:
print(f"ERROR: OpenAI-Whisper on CPU fallback also failed: {e_cpu}")
@@ -220,14 +233,14 @@ def main():
# --- 5. Assign Speaker Labels (using whisperx.assign_word_speakers) ---
print("Assigning speaker labels using whisperx.assign_word_speakers...")
result_with_speakers = {"segments": []} # Initialize
result_with_speakers = {"segments": []}
if not formatted_transcript_for_speaker_assignment["segments"]:
print("Skipping speaker assignment as there are no transcribed segments.")
elif diarize_df.empty and formatted_transcript_for_speaker_assignment["segments"]:
print("Warning: No diarization segments, but transcription exists. Output will not have speaker labels.")
for seg in formatted_transcript_for_speaker_assignment["segments"]:
new_seg = seg.copy()
new_seg['speaker'] = '[NO_DIARIZATION_DATA]' # Indicate no diarization data was available
new_seg['speaker'] = '[NO_DIARIZATION_DATA]'
if 'words' in new_seg:
for word_data in new_seg['words']:
word_data['speaker'] = '[NO_DIARIZATION_DATA]'
@@ -243,7 +256,6 @@ def main():
print(f"ERROR: whisperx.assign_word_speakers failed: {e}")
import traceback
print(traceback.format_exc())
# Fallback: use transcript but mark speakers as unassigned by whisperx
for seg_idx, seg_data in enumerate(formatted_transcript_for_speaker_assignment["segments"]):
new_seg = seg_data.copy()
new_seg['speaker'] = f'[SPEAKER_ASSIGNMENT_FAILED_{seg_idx}]'
@@ -260,14 +272,13 @@ def main():
for _, row in diarize_df.iterrows():
f.write(f"Speaker: {row['speaker']} ({format_time(row['start'])} - {format_time(row['end'])})\n")
else:
# Iterate through each segment from Whisper/OpenAI, now with speaker labels from whisperx
for segment_info in result_with_speakers["segments"]:
speaker_label = segment_info.get('speaker', '[SPEAKER_NOT_ASSIGNED]') # Default if speaker key is missing after assignment
speaker_label = segment_info.get('speaker', '[SPEAKER_NOT_ASSIGNED]')
text = segment_info.get("text", "").strip()
start_time = float(segment_info.get("start", 0.0))
end_time = float(segment_info.get("end", 0.0))
if not text: # Skip segments with no actual text
if not text:
continue
f.write(f"{speaker_label} ({format_time(start_time)} - {format_time(end_time)}): {text}\n\n")

File diff suppressed because it is too large. [Load Diff]