Long-running; this version seems broken

This commit is contained in:
Grzegorz Matoga
2025-06-04 15:40:54 +02:00
parent a0eb802390
commit f78b2364bf
2 changed files with 1410 additions and 477 deletions

View File

@@ -9,13 +9,18 @@ from dotenv import load_dotenv # For loading .env file
# --- Configuration ---
load_dotenv() # Load environment variables from .env file
YOUR_MP4_FILE = "jeden-z-10-final.mp4" # Replace with your MP4 file name
YOUR_HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN") # Ensure this is set in your .env file
# For OpenAI-Whisper: "tiny", "base", "small", "medium", "large", "large-v2", "large-v3"
# Larger models are more accurate but slower. "small" or "medium" are good next steps for Polish.
OPENAI_WHISPER_MODEL_SIZE = "large-v3" # Changed from "base" to "small" for better accuracy
# For FFmpeg audio extraction - set to None to process full audio
PROCESS_AUDIO_DURATION_SECONDS = None # User specified
YOUR_MP4_FILE = "jeden-z-10-final.mp4"
YOUR_HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
OPENAI_WHISPER_MODEL_SIZE = "large-v3" # Using the model that gave the best results
PROCESS_AUDIO_DURATION_SECONDS = None # Set to None for full audio, or e.g. 30 for testing
# Advanced Whisper transcription parameters for tuning
WHISPER_BEAM_SIZE = 5 # Default for non-English is 5. Larger can be more accurate but slower.
# Single low temperature for less randomness, potentially fewer hallucinations.
# Whisper default is a tuple: (0.0, 0.2, 0.4, 0.6, 0.8, 1.0)
WHISPER_TEMPERATURE = 0.0
WHISPER_PATIENCE = None # Default is 1.0. Higher values might help with repetition.
WHISPER_NO_SPEECH_THRESHOLD = 0.6 # Default. Lower might detect more faint speech.
# --- Paths ---
CURRENT_DIR = os.getcwd()
@@ -25,8 +30,8 @@ MP4_FILE_PATH = os.path.join(CURRENT_DIR, YOUR_MP4_FILE)
OUTPUT_TRANSCRIPT_FILE = "transcript_with_speakers.txt"
# --- Device Selection ---
PYANNOTE_DEVICE_TARGET = "cpu" # Default
WHISPER_DEVICE_TARGET = "cpu" # Default
PYANNOTE_DEVICE_TARGET = "cpu"
WHISPER_DEVICE_TARGET = "cpu"
if torch.backends.mps.is_available():
print("MPS device is available.")
@@ -98,11 +103,10 @@ def main():
print("Diarization complete.")
except Exception as e:
print(f"ERROR: Speaker diarization on {actual_pyannote_device} failed: {e}")
if actual_pyannote_device == "mps": # Fallback for Pyannote if MPS fails
if actual_pyannote_device == "mps":
print("Attempting diarization on CPU as a fallback...")
actual_pyannote_device = "cpu"
try:
# Re-initialize pipeline for CPU, as the original might be stuck on MPS
diarization_pipeline = Pipeline.from_pretrained(
"pyannote/speaker-diarization-3.1",
use_auth_token=YOUR_HUGGINGFACE_TOKEN
@@ -115,12 +119,11 @@ def main():
import traceback
print(traceback.format_exc())
return
else: # If it wasn't MPS or CPU fallback already failed
else:
import traceback
print(traceback.format_exc())
return
diarization_segments_for_df = []
for segment, _, speaker_label in diarization_result_annotation.itertracks(yield_label=True):
diarization_segments_for_df.append({
@@ -142,23 +145,33 @@ def main():
transcription_result_openai = None
model = None
transcribe_options = {
"language": "pl",
"fp16": (actual_whisper_device != "cpu"),
"word_timestamps": True,
"verbose": False,
"beam_size": WHISPER_BEAM_SIZE,
"patience": WHISPER_PATIENCE,
"no_speech_threshold": WHISPER_NO_SPEECH_THRESHOLD,
"temperature": WHISPER_TEMPERATURE
}
print(f"DEBUG: Whisper transcribe options: {transcribe_options}")
try:
use_fp16_whisper = (actual_whisper_device != "cpu")
model = whisper.load_model(OPENAI_WHISPER_MODEL_SIZE, device=actual_whisper_device)
# Explicitly set language to Polish
transcription_result_openai = model.transcribe(AUDIO_FILE_PATH, language="pl", fp16=use_fp16_whisper, word_timestamps=True, verbose=False)
transcription_result_openai = model.transcribe(AUDIO_FILE_PATH, **transcribe_options)
print(f"OpenAI-Whisper transcription on {actual_whisper_device} complete. Detected language (should be pl): {transcription_result_openai.get('language', 'N/A')}")
except NotImplementedError as nie:
print(f"ERROR: OpenAI-Whisper on {actual_whisper_device} failed with NotImplementedError: {nie}")
if actual_whisper_device == "mps":
print("Falling back to CPU for OpenAI-Whisper due to MPS NotImplementedError.")
actual_whisper_device = "cpu"
use_fp16_whisper = False
transcribe_options["fp16"] = False # CPU does not use fp16
print(f"DEBUG: Whisper transcribe options (CPU fallback): {transcribe_options}")
try:
model = whisper.load_model(OPENAI_WHISPER_MODEL_SIZE, device=actual_whisper_device)
# Explicitly set language to Polish for CPU fallback as well
transcription_result_openai = model.transcribe(AUDIO_FILE_PATH, language="pl", fp16=use_fp16_whisper, word_timestamps=True, verbose=False)
transcription_result_openai = model.transcribe(AUDIO_FILE_PATH, **transcribe_options)
print(f"OpenAI-Whisper transcription on CPU fallback complete. Detected language (should be pl): {transcription_result_openai.get('language', 'N/A')}")
except Exception as e_cpu:
print(f"ERROR: OpenAI-Whisper on CPU fallback also failed: {e_cpu}")
@@ -220,14 +233,14 @@ def main():
# --- 5. Assign Speaker Labels (using whisperx.assign_word_speakers) ---
print("Assigning speaker labels using whisperx.assign_word_speakers...")
result_with_speakers = {"segments": []} # Initialize
result_with_speakers = {"segments": []}
if not formatted_transcript_for_speaker_assignment["segments"]:
print("Skipping speaker assignment as there are no transcribed segments.")
elif diarize_df.empty and formatted_transcript_for_speaker_assignment["segments"]:
print("Warning: No diarization segments, but transcription exists. Output will not have speaker labels.")
for seg in formatted_transcript_for_speaker_assignment["segments"]:
new_seg = seg.copy()
new_seg['speaker'] = '[NO_DIARIZATION_DATA]' # Indicate no diarization data was available
new_seg['speaker'] = '[NO_DIARIZATION_DATA]'
if 'words' in new_seg:
for word_data in new_seg['words']:
word_data['speaker'] = '[NO_DIARIZATION_DATA]'
@@ -243,7 +256,6 @@ def main():
print(f"ERROR: whisperx.assign_word_speakers failed: {e}")
import traceback
print(traceback.format_exc())
# Fallback: use transcript but mark speakers as unassigned by whisperx
for seg_idx, seg_data in enumerate(formatted_transcript_for_speaker_assignment["segments"]):
new_seg = seg_data.copy()
new_seg['speaker'] = f'[SPEAKER_ASSIGNMENT_FAILED_{seg_idx}]'
@@ -260,14 +272,13 @@ def main():
for _, row in diarize_df.iterrows():
f.write(f"Speaker: {row['speaker']} ({format_time(row['start'])} - {format_time(row['end'])})\n")
else:
# Iterate through each segment from Whisper/OpenAI, now with speaker labels from whisperx
for segment_info in result_with_speakers["segments"]:
speaker_label = segment_info.get('speaker', '[SPEAKER_NOT_ASSIGNED]') # Default if speaker key is missing after assignment
speaker_label = segment_info.get('speaker', '[SPEAKER_NOT_ASSIGNED]')
text = segment_info.get("text", "").strip()
start_time = float(segment_info.get("start", 0.0))
end_time = float(segment_info.get("end", 0.0))
if not text: # Skip segments with no actual text
if not text:
continue
f.write(f"{speaker_label} ({format_time(start_time)} - {format_time(end_time)}): {text}\n\n")

File diff suppressed because it is too large. [Load Diff]