long running, seems broken version

This commit is contained in:
Grzegorz Matoga
2025-06-04 15:40:54 +02:00
parent a0eb802390
commit f78b2364bf
2 changed files with 1410 additions and 477 deletions

View File

@@ -9,13 +9,18 @@ from dotenv import load_dotenv # For loading .env file
# --- Configuration --- # --- Configuration ---
load_dotenv() # Load environment variables from .env file load_dotenv() # Load environment variables from .env file
YOUR_MP4_FILE = "jeden-z-10-final.mp4" # Replace with your MP4 file name YOUR_MP4_FILE = "jeden-z-10-final.mp4"
YOUR_HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN") # Ensure this is set in your .env file YOUR_HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
# For OpenAI-Whisper: "tiny", "base", "small", "medium", "large", "large-v2", "large-v3" OPENAI_WHISPER_MODEL_SIZE = "large-v3" # Using the model that gave the best results
# Larger models are more accurate but slower. "small" or "medium" are good next steps for Polish. PROCESS_AUDIO_DURATION_SECONDS = None # Set to None for full audio, or e.g. 30 for testing
OPENAI_WHISPER_MODEL_SIZE = "large-v3" # Changed from "base" to "small" for better accuracy
# For FFmpeg audio extraction - set to None to process full audio # Advanced Whisper transcription parameters for tuning
PROCESS_AUDIO_DURATION_SECONDS = None # User specified WHISPER_BEAM_SIZE = 5 # Matches the Whisper CLI default of 5 (the Python API uses greedy decoding when beam_size is not given). Larger can be more accurate but slower.
# Single low temperature for less randomness, potentially fewer hallucinations.
# Whisper default is a tuple: (0.0, 0.2, 0.4, 0.6, 0.8, 1.0)
WHISPER_TEMPERATURE = 0.0
WHISPER_PATIENCE = None # Default is 1.0. Higher values might help with repetition.
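WHISPER_NO_SPEECH_THRESHOLD = 0.6 # Default. A segment is skipped as silence when its no-speech probability exceeds this value (and its average log-probability is below logprob_threshold), so a higher value keeps more faint speech and a lower value skips more.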
# --- Paths --- # --- Paths ---
CURRENT_DIR = os.getcwd() CURRENT_DIR = os.getcwd()
@@ -25,8 +30,8 @@ MP4_FILE_PATH = os.path.join(CURRENT_DIR, YOUR_MP4_FILE)
OUTPUT_TRANSCRIPT_FILE = "transcript_with_speakers.txt" OUTPUT_TRANSCRIPT_FILE = "transcript_with_speakers.txt"
# --- Device Selection --- # --- Device Selection ---
PYANNOTE_DEVICE_TARGET = "cpu" # Default PYANNOTE_DEVICE_TARGET = "cpu"
WHISPER_DEVICE_TARGET = "cpu" # Default WHISPER_DEVICE_TARGET = "cpu"
if torch.backends.mps.is_available(): if torch.backends.mps.is_available():
print("MPS device is available.") print("MPS device is available.")
@@ -98,11 +103,10 @@ def main():
print("Diarization complete.") print("Diarization complete.")
except Exception as e: except Exception as e:
print(f"ERROR: Speaker diarization on {actual_pyannote_device} failed: {e}") print(f"ERROR: Speaker diarization on {actual_pyannote_device} failed: {e}")
if actual_pyannote_device == "mps": # Fallback for Pyannote if MPS fails if actual_pyannote_device == "mps":
print("Attempting diarization on CPU as a fallback...") print("Attempting diarization on CPU as a fallback...")
actual_pyannote_device = "cpu" actual_pyannote_device = "cpu"
try: try:
# Re-initialize pipeline for CPU, as the original might be stuck on MPS
diarization_pipeline = Pipeline.from_pretrained( diarization_pipeline = Pipeline.from_pretrained(
"pyannote/speaker-diarization-3.1", "pyannote/speaker-diarization-3.1",
use_auth_token=YOUR_HUGGINGFACE_TOKEN use_auth_token=YOUR_HUGGINGFACE_TOKEN
@@ -115,12 +119,11 @@ def main():
import traceback import traceback
print(traceback.format_exc()) print(traceback.format_exc())
return return
else: # If it wasn't MPS or CPU fallback already failed else:
import traceback import traceback
print(traceback.format_exc()) print(traceback.format_exc())
return return
diarization_segments_for_df = [] diarization_segments_for_df = []
for segment, _, speaker_label in diarization_result_annotation.itertracks(yield_label=True): for segment, _, speaker_label in diarization_result_annotation.itertracks(yield_label=True):
diarization_segments_for_df.append({ diarization_segments_for_df.append({
@@ -142,23 +145,33 @@ def main():
transcription_result_openai = None transcription_result_openai = None
model = None model = None
transcribe_options = {
"language": "pl",
"fp16": (actual_whisper_device != "cpu"),
"word_timestamps": True,
"verbose": False,
"beam_size": WHISPER_BEAM_SIZE,
"patience": WHISPER_PATIENCE,
"no_speech_threshold": WHISPER_NO_SPEECH_THRESHOLD,
"temperature": WHISPER_TEMPERATURE
}
print(f"DEBUG: Whisper transcribe options: {transcribe_options}")
try: try:
use_fp16_whisper = (actual_whisper_device != "cpu")
model = whisper.load_model(OPENAI_WHISPER_MODEL_SIZE, device=actual_whisper_device) model = whisper.load_model(OPENAI_WHISPER_MODEL_SIZE, device=actual_whisper_device)
# Explicitly set language to Polish transcription_result_openai = model.transcribe(AUDIO_FILE_PATH, **transcribe_options)
transcription_result_openai = model.transcribe(AUDIO_FILE_PATH, language="pl", fp16=use_fp16_whisper, word_timestamps=True, verbose=False)
print(f"OpenAI-Whisper transcription on {actual_whisper_device} complete. Detected language (should be pl): {transcription_result_openai.get('language', 'N/A')}") print(f"OpenAI-Whisper transcription on {actual_whisper_device} complete. Detected language (should be pl): {transcription_result_openai.get('language', 'N/A')}")
except NotImplementedError as nie: except NotImplementedError as nie:
print(f"ERROR: OpenAI-Whisper on {actual_whisper_device} failed with NotImplementedError: {nie}") print(f"ERROR: OpenAI-Whisper on {actual_whisper_device} failed with NotImplementedError: {nie}")
if actual_whisper_device == "mps": if actual_whisper_device == "mps":
print("Falling back to CPU for OpenAI-Whisper due to MPS NotImplementedError.") print("Falling back to CPU for OpenAI-Whisper due to MPS NotImplementedError.")
actual_whisper_device = "cpu" actual_whisper_device = "cpu"
use_fp16_whisper = False transcribe_options["fp16"] = False # CPU does not use fp16
print(f"DEBUG: Whisper transcribe options (CPU fallback): {transcribe_options}")
try: try:
model = whisper.load_model(OPENAI_WHISPER_MODEL_SIZE, device=actual_whisper_device) model = whisper.load_model(OPENAI_WHISPER_MODEL_SIZE, device=actual_whisper_device)
# Explicitly set language to Polish for CPU fallback as well transcription_result_openai = model.transcribe(AUDIO_FILE_PATH, **transcribe_options)
transcription_result_openai = model.transcribe(AUDIO_FILE_PATH, language="pl", fp16=use_fp16_whisper, word_timestamps=True, verbose=False)
print(f"OpenAI-Whisper transcription on CPU fallback complete. Detected language (should be pl): {transcription_result_openai.get('language', 'N/A')}") print(f"OpenAI-Whisper transcription on CPU fallback complete. Detected language (should be pl): {transcription_result_openai.get('language', 'N/A')}")
except Exception as e_cpu: except Exception as e_cpu:
print(f"ERROR: OpenAI-Whisper on CPU fallback also failed: {e_cpu}") print(f"ERROR: OpenAI-Whisper on CPU fallback also failed: {e_cpu}")
@@ -220,14 +233,14 @@ def main():
# --- 5. Assign Speaker Labels (using whisperx.assign_word_speakers) --- # --- 5. Assign Speaker Labels (using whisperx.assign_word_speakers) ---
print("Assigning speaker labels using whisperx.assign_word_speakers...") print("Assigning speaker labels using whisperx.assign_word_speakers...")
result_with_speakers = {"segments": []} # Initialize result_with_speakers = {"segments": []}
if not formatted_transcript_for_speaker_assignment["segments"]: if not formatted_transcript_for_speaker_assignment["segments"]:
print("Skipping speaker assignment as there are no transcribed segments.") print("Skipping speaker assignment as there are no transcribed segments.")
elif diarize_df.empty and formatted_transcript_for_speaker_assignment["segments"]: elif diarize_df.empty and formatted_transcript_for_speaker_assignment["segments"]:
print("Warning: No diarization segments, but transcription exists. Output will not have speaker labels.") print("Warning: No diarization segments, but transcription exists. Output will not have speaker labels.")
for seg in formatted_transcript_for_speaker_assignment["segments"]: for seg in formatted_transcript_for_speaker_assignment["segments"]:
new_seg = seg.copy() new_seg = seg.copy()
new_seg['speaker'] = '[NO_DIARIZATION_DATA]' # Indicate no diarization data was available new_seg['speaker'] = '[NO_DIARIZATION_DATA]'
if 'words' in new_seg: if 'words' in new_seg:
for word_data in new_seg['words']: for word_data in new_seg['words']:
word_data['speaker'] = '[NO_DIARIZATION_DATA]' word_data['speaker'] = '[NO_DIARIZATION_DATA]'
@@ -243,7 +256,6 @@ def main():
print(f"ERROR: whisperx.assign_word_speakers failed: {e}") print(f"ERROR: whisperx.assign_word_speakers failed: {e}")
import traceback import traceback
print(traceback.format_exc()) print(traceback.format_exc())
# Fallback: use transcript but mark speakers as unassigned by whisperx
for seg_idx, seg_data in enumerate(formatted_transcript_for_speaker_assignment["segments"]): for seg_idx, seg_data in enumerate(formatted_transcript_for_speaker_assignment["segments"]):
new_seg = seg_data.copy() new_seg = seg_data.copy()
new_seg['speaker'] = f'[SPEAKER_ASSIGNMENT_FAILED_{seg_idx}]' new_seg['speaker'] = f'[SPEAKER_ASSIGNMENT_FAILED_{seg_idx}]'
@@ -260,14 +272,13 @@ def main():
for _, row in diarize_df.iterrows(): for _, row in diarize_df.iterrows():
f.write(f"Speaker: {row['speaker']} ({format_time(row['start'])} - {format_time(row['end'])})\n") f.write(f"Speaker: {row['speaker']} ({format_time(row['start'])} - {format_time(row['end'])})\n")
else: else:
# Iterate through each segment from Whisper/OpenAI, now with speaker labels from whisperx
for segment_info in result_with_speakers["segments"]: for segment_info in result_with_speakers["segments"]:
speaker_label = segment_info.get('speaker', '[SPEAKER_NOT_ASSIGNED]') # Default if speaker key is missing after assignment speaker_label = segment_info.get('speaker', '[SPEAKER_NOT_ASSIGNED]')
text = segment_info.get("text", "").strip() text = segment_info.get("text", "").strip()
start_time = float(segment_info.get("start", 0.0)) start_time = float(segment_info.get("start", 0.0))
end_time = float(segment_info.get("end", 0.0)) end_time = float(segment_info.get("end", 0.0))
if not text: # Skip segments with no actual text if not text:
continue continue
f.write(f"{speaker_label} ({format_time(start_time)} - {format_time(end_time)}): {text}\n\n") f.write(f"{speaker_label} ({format_time(start_time)} - {format_time(end_time)}): {text}\n\n")

File diff suppressed because it is too large Load Diff