long running, seems broken version
This commit is contained in:
@@ -9,13 +9,18 @@ from dotenv import load_dotenv # For loading .env file
|
||||
|
||||
# --- Configuration ---
|
||||
load_dotenv() # Load environment variables from .env file
|
||||
YOUR_MP4_FILE = "jeden-z-10-final.mp4" # Replace with your MP4 file name
|
||||
YOUR_HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN") # Ensure this is set in your .env file
|
||||
# For OpenAI-Whisper: "tiny", "base", "small", "medium", "large", "large-v2", "large-v3"
|
||||
# Larger models are more accurate but slower. "small" or "medium" are good next steps for Polish.
|
||||
OPENAI_WHISPER_MODEL_SIZE = "large-v3" # Changed from "base" to "small" for better accuracy
|
||||
# For FFmpeg audio extraction - set to None to process full audio
|
||||
PROCESS_AUDIO_DURATION_SECONDS = None # User specified
|
||||
YOUR_MP4_FILE = "jeden-z-10-final.mp4"
|
||||
YOUR_HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
|
||||
OPENAI_WHISPER_MODEL_SIZE = "large-v3" # Using the model that gave the best results
|
||||
PROCESS_AUDIO_DURATION_SECONDS = None # Set to None for full audio, or e.g. 30 for testing
|
||||
|
||||
# Advanced Whisper transcription parameters for tuning
|
||||
WHISPER_BEAM_SIZE = 5 # Whisper CLI default is 5 (the Python API decodes greedily when this is unset). Larger can be more accurate but slower.
|
||||
# Single low temperature for less randomness, potentially fewer hallucinations.
|
||||
# Whisper default is a tuple: (0.0, 0.2, 0.4, 0.6, 0.8, 1.0)
|
||||
WHISPER_TEMPERATURE = 0.0
|
||||
WHISPER_PATIENCE = None # Default is 1.0. Higher values might help with repetition.
|
||||
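WHISPER_NO_SPEECH_THRESHOLD = 0.6 # Default. Segments whose no-speech probability exceeds this (and whose avg log-prob is low) are skipped as silence, so a higher value keeps more faint speech.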
|
||||
|
||||
# --- Paths ---
|
||||
CURRENT_DIR = os.getcwd()
|
||||
@@ -25,8 +30,8 @@ MP4_FILE_PATH = os.path.join(CURRENT_DIR, YOUR_MP4_FILE)
|
||||
OUTPUT_TRANSCRIPT_FILE = "transcript_with_speakers.txt"
|
||||
|
||||
# --- Device Selection ---
|
||||
PYANNOTE_DEVICE_TARGET = "cpu" # Default
|
||||
WHISPER_DEVICE_TARGET = "cpu" # Default
|
||||
PYANNOTE_DEVICE_TARGET = "cpu"
|
||||
WHISPER_DEVICE_TARGET = "cpu"
|
||||
|
||||
if torch.backends.mps.is_available():
|
||||
print("MPS device is available.")
|
||||
@@ -98,11 +103,10 @@ def main():
|
||||
print("Diarization complete.")
|
||||
except Exception as e:
|
||||
print(f"ERROR: Speaker diarization on {actual_pyannote_device} failed: {e}")
|
||||
if actual_pyannote_device == "mps": # Fallback for Pyannote if MPS fails
|
||||
if actual_pyannote_device == "mps":
|
||||
print("Attempting diarization on CPU as a fallback...")
|
||||
actual_pyannote_device = "cpu"
|
||||
try:
|
||||
# Re-initialize pipeline for CPU, as the original might be stuck on MPS
|
||||
diarization_pipeline = Pipeline.from_pretrained(
|
||||
"pyannote/speaker-diarization-3.1",
|
||||
use_auth_token=YOUR_HUGGINGFACE_TOKEN
|
||||
@@ -115,12 +119,11 @@ def main():
|
||||
import traceback
|
||||
print(traceback.format_exc())
|
||||
return
|
||||
else: # If it wasn't MPS or CPU fallback already failed
|
||||
else:
|
||||
import traceback
|
||||
print(traceback.format_exc())
|
||||
return
|
||||
|
||||
|
||||
diarization_segments_for_df = []
|
||||
for segment, _, speaker_label in diarization_result_annotation.itertracks(yield_label=True):
|
||||
diarization_segments_for_df.append({
|
||||
@@ -142,23 +145,33 @@ def main():
|
||||
|
||||
transcription_result_openai = None
|
||||
model = None
|
||||
transcribe_options = {
|
||||
"language": "pl",
|
||||
"fp16": (actual_whisper_device != "cpu"),
|
||||
"word_timestamps": True,
|
||||
"verbose": False,
|
||||
"beam_size": WHISPER_BEAM_SIZE,
|
||||
"patience": WHISPER_PATIENCE,
|
||||
"no_speech_threshold": WHISPER_NO_SPEECH_THRESHOLD,
|
||||
"temperature": WHISPER_TEMPERATURE
|
||||
}
|
||||
print(f"DEBUG: Whisper transcribe options: {transcribe_options}")
|
||||
|
||||
|
||||
try:
|
||||
use_fp16_whisper = (actual_whisper_device != "cpu")
|
||||
model = whisper.load_model(OPENAI_WHISPER_MODEL_SIZE, device=actual_whisper_device)
|
||||
# Explicitly set language to Polish
|
||||
transcription_result_openai = model.transcribe(AUDIO_FILE_PATH, language="pl", fp16=use_fp16_whisper, word_timestamps=True, verbose=False)
|
||||
transcription_result_openai = model.transcribe(AUDIO_FILE_PATH, **transcribe_options)
|
||||
print(f"OpenAI-Whisper transcription on {actual_whisper_device} complete. Detected language (should be pl): {transcription_result_openai.get('language', 'N/A')}")
|
||||
except NotImplementedError as nie:
|
||||
print(f"ERROR: OpenAI-Whisper on {actual_whisper_device} failed with NotImplementedError: {nie}")
|
||||
if actual_whisper_device == "mps":
|
||||
print("Falling back to CPU for OpenAI-Whisper due to MPS NotImplementedError.")
|
||||
actual_whisper_device = "cpu"
|
||||
use_fp16_whisper = False
|
||||
transcribe_options["fp16"] = False # CPU does not use fp16
|
||||
print(f"DEBUG: Whisper transcribe options (CPU fallback): {transcribe_options}")
|
||||
try:
|
||||
model = whisper.load_model(OPENAI_WHISPER_MODEL_SIZE, device=actual_whisper_device)
|
||||
# Explicitly set language to Polish for CPU fallback as well
|
||||
transcription_result_openai = model.transcribe(AUDIO_FILE_PATH, language="pl", fp16=use_fp16_whisper, word_timestamps=True, verbose=False)
|
||||
transcription_result_openai = model.transcribe(AUDIO_FILE_PATH, **transcribe_options)
|
||||
print(f"OpenAI-Whisper transcription on CPU fallback complete. Detected language (should be pl): {transcription_result_openai.get('language', 'N/A')}")
|
||||
except Exception as e_cpu:
|
||||
print(f"ERROR: OpenAI-Whisper on CPU fallback also failed: {e_cpu}")
|
||||
@@ -220,14 +233,14 @@ def main():
|
||||
|
||||
# --- 5. Assign Speaker Labels (using whisperx.assign_word_speakers) ---
|
||||
print("Assigning speaker labels using whisperx.assign_word_speakers...")
|
||||
result_with_speakers = {"segments": []} # Initialize
|
||||
result_with_speakers = {"segments": []}
|
||||
if not formatted_transcript_for_speaker_assignment["segments"]:
|
||||
print("Skipping speaker assignment as there are no transcribed segments.")
|
||||
elif diarize_df.empty and formatted_transcript_for_speaker_assignment["segments"]:
|
||||
print("Warning: No diarization segments, but transcription exists. Output will not have speaker labels.")
|
||||
for seg in formatted_transcript_for_speaker_assignment["segments"]:
|
||||
new_seg = seg.copy()
|
||||
new_seg['speaker'] = '[NO_DIARIZATION_DATA]' # Indicate no diarization data was available
|
||||
new_seg['speaker'] = '[NO_DIARIZATION_DATA]'
|
||||
if 'words' in new_seg:
|
||||
for word_data in new_seg['words']:
|
||||
word_data['speaker'] = '[NO_DIARIZATION_DATA]'
|
||||
@@ -243,7 +256,6 @@ def main():
|
||||
print(f"ERROR: whisperx.assign_word_speakers failed: {e}")
|
||||
import traceback
|
||||
print(traceback.format_exc())
|
||||
# Fallback: use transcript but mark speakers as unassigned by whisperx
|
||||
for seg_idx, seg_data in enumerate(formatted_transcript_for_speaker_assignment["segments"]):
|
||||
new_seg = seg_data.copy()
|
||||
new_seg['speaker'] = f'[SPEAKER_ASSIGNMENT_FAILED_{seg_idx}]'
|
||||
@@ -260,14 +272,13 @@ def main():
|
||||
for _, row in diarize_df.iterrows():
|
||||
f.write(f"Speaker: {row['speaker']} ({format_time(row['start'])} - {format_time(row['end'])})\n")
|
||||
else:
|
||||
# Iterate through each segment from Whisper/OpenAI, now with speaker labels from whisperx
|
||||
for segment_info in result_with_speakers["segments"]:
|
||||
speaker_label = segment_info.get('speaker', '[SPEAKER_NOT_ASSIGNED]') # Default if speaker key is missing after assignment
|
||||
speaker_label = segment_info.get('speaker', '[SPEAKER_NOT_ASSIGNED]')
|
||||
text = segment_info.get("text", "").strip()
|
||||
start_time = float(segment_info.get("start", 0.0))
|
||||
end_time = float(segment_info.get("end", 0.0))
|
||||
|
||||
if not text: # Skip segments with no actual text
|
||||
if not text:
|
||||
continue
|
||||
|
||||
f.write(f"{speaker_label} ({format_time(start_time)} - {format_time(end_time)}): {text}\n\n")
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user