Long-running version that seems to be broken
This commit is contained in:
@@ -9,13 +9,18 @@ from dotenv import load_dotenv # For loading .env file
|
|||||||
|
|
||||||
# --- Configuration ---
|
# --- Configuration ---
|
||||||
load_dotenv() # Load environment variables from .env file
|
load_dotenv() # Load environment variables from .env file
|
||||||
YOUR_MP4_FILE = "jeden-z-10-final.mp4" # Replace with your MP4 file name
|
YOUR_MP4_FILE = "jeden-z-10-final.mp4"
|
||||||
YOUR_HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN") # Ensure this is set in your .env file
|
YOUR_HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
|
||||||
# For OpenAI-Whisper: "tiny", "base", "small", "medium", "large", "large-v2", "large-v3"
|
OPENAI_WHISPER_MODEL_SIZE = "large-v3" # Using the model that gave the best results
|
||||||
# Larger models are more accurate but slower. "small" or "medium" are good next steps for Polish.
|
PROCESS_AUDIO_DURATION_SECONDS = None # Set to None for full audio, or e.g. 30 for testing
|
||||||
OPENAI_WHISPER_MODEL_SIZE = "large-v3" # Largest model listed above: more accurate than "base"/"small", but slower
|
|
||||||
# For FFmpeg audio extraction - set to None to process full audio
|
# Advanced Whisper transcription parameters for tuning
|
||||||
PROCESS_AUDIO_DURATION_SECONDS = None # User specified
|
WHISPER_BEAM_SIZE = 5 # Matches the whisper CLI default of 5 (the Python API defaults to None, i.e. greedy decoding). Larger can be more accurate but slower.
|
||||||
|
# Single low temperature for less randomness, potentially fewer hallucinations.
|
||||||
|
# Whisper default is a tuple: (0.0, 0.2, 0.4, 0.6, 0.8, 1.0)
|
||||||
|
WHISPER_TEMPERATURE = 0.0
|
||||||
|
WHISPER_PATIENCE = None # Default is 1.0. Higher values might help with repetition.
|
||||||
|
WHISPER_NO_SPEECH_THRESHOLD = 0.6 # Default. Lower might detect more faint speech.
|
||||||
|
|
||||||
# --- Paths ---
|
# --- Paths ---
|
||||||
CURRENT_DIR = os.getcwd()
|
CURRENT_DIR = os.getcwd()
|
||||||
@@ -25,8 +30,8 @@ MP4_FILE_PATH = os.path.join(CURRENT_DIR, YOUR_MP4_FILE)
|
|||||||
OUTPUT_TRANSCRIPT_FILE = "transcript_with_speakers.txt"
|
OUTPUT_TRANSCRIPT_FILE = "transcript_with_speakers.txt"
|
||||||
|
|
||||||
# --- Device Selection ---
|
# --- Device Selection ---
|
||||||
PYANNOTE_DEVICE_TARGET = "cpu" # Default
|
PYANNOTE_DEVICE_TARGET = "cpu"
|
||||||
WHISPER_DEVICE_TARGET = "cpu" # Default
|
WHISPER_DEVICE_TARGET = "cpu"
|
||||||
|
|
||||||
if torch.backends.mps.is_available():
|
if torch.backends.mps.is_available():
|
||||||
print("MPS device is available.")
|
print("MPS device is available.")
|
||||||
@@ -98,11 +103,10 @@ def main():
|
|||||||
print("Diarization complete.")
|
print("Diarization complete.")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"ERROR: Speaker diarization on {actual_pyannote_device} failed: {e}")
|
print(f"ERROR: Speaker diarization on {actual_pyannote_device} failed: {e}")
|
||||||
if actual_pyannote_device == "mps": # Fallback for Pyannote if MPS fails
|
if actual_pyannote_device == "mps":
|
||||||
print("Attempting diarization on CPU as a fallback...")
|
print("Attempting diarization on CPU as a fallback...")
|
||||||
actual_pyannote_device = "cpu"
|
actual_pyannote_device = "cpu"
|
||||||
try:
|
try:
|
||||||
# Re-initialize pipeline for CPU, as the original might be stuck on MPS
|
|
||||||
diarization_pipeline = Pipeline.from_pretrained(
|
diarization_pipeline = Pipeline.from_pretrained(
|
||||||
"pyannote/speaker-diarization-3.1",
|
"pyannote/speaker-diarization-3.1",
|
||||||
use_auth_token=YOUR_HUGGINGFACE_TOKEN
|
use_auth_token=YOUR_HUGGINGFACE_TOKEN
|
||||||
@@ -115,12 +119,11 @@ def main():
|
|||||||
import traceback
|
import traceback
|
||||||
print(traceback.format_exc())
|
print(traceback.format_exc())
|
||||||
return
|
return
|
||||||
else: # If it wasn't MPS or CPU fallback already failed
|
else:
|
||||||
import traceback
|
import traceback
|
||||||
print(traceback.format_exc())
|
print(traceback.format_exc())
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
diarization_segments_for_df = []
|
diarization_segments_for_df = []
|
||||||
for segment, _, speaker_label in diarization_result_annotation.itertracks(yield_label=True):
|
for segment, _, speaker_label in diarization_result_annotation.itertracks(yield_label=True):
|
||||||
diarization_segments_for_df.append({
|
diarization_segments_for_df.append({
|
||||||
@@ -142,23 +145,33 @@ def main():
|
|||||||
|
|
||||||
transcription_result_openai = None
|
transcription_result_openai = None
|
||||||
model = None
|
model = None
|
||||||
|
transcribe_options = {
|
||||||
|
"language": "pl",
|
||||||
|
"fp16": (actual_whisper_device != "cpu"),
|
||||||
|
"word_timestamps": True,
|
||||||
|
"verbose": False,
|
||||||
|
"beam_size": WHISPER_BEAM_SIZE,
|
||||||
|
"patience": WHISPER_PATIENCE,
|
||||||
|
"no_speech_threshold": WHISPER_NO_SPEECH_THRESHOLD,
|
||||||
|
"temperature": WHISPER_TEMPERATURE
|
||||||
|
}
|
||||||
|
print(f"DEBUG: Whisper transcribe options: {transcribe_options}")
|
||||||
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
use_fp16_whisper = (actual_whisper_device != "cpu")
|
|
||||||
model = whisper.load_model(OPENAI_WHISPER_MODEL_SIZE, device=actual_whisper_device)
|
model = whisper.load_model(OPENAI_WHISPER_MODEL_SIZE, device=actual_whisper_device)
|
||||||
# Explicitly set language to Polish
|
transcription_result_openai = model.transcribe(AUDIO_FILE_PATH, **transcribe_options)
|
||||||
transcription_result_openai = model.transcribe(AUDIO_FILE_PATH, language="pl", fp16=use_fp16_whisper, word_timestamps=True, verbose=False)
|
|
||||||
print(f"OpenAI-Whisper transcription on {actual_whisper_device} complete. Detected language (should be pl): {transcription_result_openai.get('language', 'N/A')}")
|
print(f"OpenAI-Whisper transcription on {actual_whisper_device} complete. Detected language (should be pl): {transcription_result_openai.get('language', 'N/A')}")
|
||||||
except NotImplementedError as nie:
|
except NotImplementedError as nie:
|
||||||
print(f"ERROR: OpenAI-Whisper on {actual_whisper_device} failed with NotImplementedError: {nie}")
|
print(f"ERROR: OpenAI-Whisper on {actual_whisper_device} failed with NotImplementedError: {nie}")
|
||||||
if actual_whisper_device == "mps":
|
if actual_whisper_device == "mps":
|
||||||
print("Falling back to CPU for OpenAI-Whisper due to MPS NotImplementedError.")
|
print("Falling back to CPU for OpenAI-Whisper due to MPS NotImplementedError.")
|
||||||
actual_whisper_device = "cpu"
|
actual_whisper_device = "cpu"
|
||||||
use_fp16_whisper = False
|
transcribe_options["fp16"] = False # CPU does not use fp16
|
||||||
|
print(f"DEBUG: Whisper transcribe options (CPU fallback): {transcribe_options}")
|
||||||
try:
|
try:
|
||||||
model = whisper.load_model(OPENAI_WHISPER_MODEL_SIZE, device=actual_whisper_device)
|
model = whisper.load_model(OPENAI_WHISPER_MODEL_SIZE, device=actual_whisper_device)
|
||||||
# Explicitly set language to Polish for CPU fallback as well
|
transcription_result_openai = model.transcribe(AUDIO_FILE_PATH, **transcribe_options)
|
||||||
transcription_result_openai = model.transcribe(AUDIO_FILE_PATH, language="pl", fp16=use_fp16_whisper, word_timestamps=True, verbose=False)
|
|
||||||
print(f"OpenAI-Whisper transcription on CPU fallback complete. Detected language (should be pl): {transcription_result_openai.get('language', 'N/A')}")
|
print(f"OpenAI-Whisper transcription on CPU fallback complete. Detected language (should be pl): {transcription_result_openai.get('language', 'N/A')}")
|
||||||
except Exception as e_cpu:
|
except Exception as e_cpu:
|
||||||
print(f"ERROR: OpenAI-Whisper on CPU fallback also failed: {e_cpu}")
|
print(f"ERROR: OpenAI-Whisper on CPU fallback also failed: {e_cpu}")
|
||||||
@@ -220,14 +233,14 @@ def main():
|
|||||||
|
|
||||||
# --- 5. Assign Speaker Labels (using whisperx.assign_word_speakers) ---
|
# --- 5. Assign Speaker Labels (using whisperx.assign_word_speakers) ---
|
||||||
print("Assigning speaker labels using whisperx.assign_word_speakers...")
|
print("Assigning speaker labels using whisperx.assign_word_speakers...")
|
||||||
result_with_speakers = {"segments": []} # Initialize
|
result_with_speakers = {"segments": []}
|
||||||
if not formatted_transcript_for_speaker_assignment["segments"]:
|
if not formatted_transcript_for_speaker_assignment["segments"]:
|
||||||
print("Skipping speaker assignment as there are no transcribed segments.")
|
print("Skipping speaker assignment as there are no transcribed segments.")
|
||||||
elif diarize_df.empty and formatted_transcript_for_speaker_assignment["segments"]:
|
elif diarize_df.empty and formatted_transcript_for_speaker_assignment["segments"]:
|
||||||
print("Warning: No diarization segments, but transcription exists. Output will not have speaker labels.")
|
print("Warning: No diarization segments, but transcription exists. Output will not have speaker labels.")
|
||||||
for seg in formatted_transcript_for_speaker_assignment["segments"]:
|
for seg in formatted_transcript_for_speaker_assignment["segments"]:
|
||||||
new_seg = seg.copy()
|
new_seg = seg.copy()
|
||||||
new_seg['speaker'] = '[NO_DIARIZATION_DATA]' # Indicate no diarization data was available
|
new_seg['speaker'] = '[NO_DIARIZATION_DATA]'
|
||||||
if 'words' in new_seg:
|
if 'words' in new_seg:
|
||||||
for word_data in new_seg['words']:
|
for word_data in new_seg['words']:
|
||||||
word_data['speaker'] = '[NO_DIARIZATION_DATA]'
|
word_data['speaker'] = '[NO_DIARIZATION_DATA]'
|
||||||
@@ -243,7 +256,6 @@ def main():
|
|||||||
print(f"ERROR: whisperx.assign_word_speakers failed: {e}")
|
print(f"ERROR: whisperx.assign_word_speakers failed: {e}")
|
||||||
import traceback
|
import traceback
|
||||||
print(traceback.format_exc())
|
print(traceback.format_exc())
|
||||||
# Fallback: use transcript but mark speakers as unassigned by whisperx
|
|
||||||
for seg_idx, seg_data in enumerate(formatted_transcript_for_speaker_assignment["segments"]):
|
for seg_idx, seg_data in enumerate(formatted_transcript_for_speaker_assignment["segments"]):
|
||||||
new_seg = seg_data.copy()
|
new_seg = seg_data.copy()
|
||||||
new_seg['speaker'] = f'[SPEAKER_ASSIGNMENT_FAILED_{seg_idx}]'
|
new_seg['speaker'] = f'[SPEAKER_ASSIGNMENT_FAILED_{seg_idx}]'
|
||||||
@@ -260,14 +272,13 @@ def main():
|
|||||||
for _, row in diarize_df.iterrows():
|
for _, row in diarize_df.iterrows():
|
||||||
f.write(f"Speaker: {row['speaker']} ({format_time(row['start'])} - {format_time(row['end'])})\n")
|
f.write(f"Speaker: {row['speaker']} ({format_time(row['start'])} - {format_time(row['end'])})\n")
|
||||||
else:
|
else:
|
||||||
# Iterate through each segment from Whisper/OpenAI, now with speaker labels from whisperx
|
|
||||||
for segment_info in result_with_speakers["segments"]:
|
for segment_info in result_with_speakers["segments"]:
|
||||||
speaker_label = segment_info.get('speaker', '[SPEAKER_NOT_ASSIGNED]') # Default if speaker key is missing after assignment
|
speaker_label = segment_info.get('speaker', '[SPEAKER_NOT_ASSIGNED]')
|
||||||
text = segment_info.get("text", "").strip()
|
text = segment_info.get("text", "").strip()
|
||||||
start_time = float(segment_info.get("start", 0.0))
|
start_time = float(segment_info.get("start", 0.0))
|
||||||
end_time = float(segment_info.get("end", 0.0))
|
end_time = float(segment_info.get("end", 0.0))
|
||||||
|
|
||||||
if not text: # Skip segments with no actual text
|
if not text:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
f.write(f"{speaker_label} ({format_time(start_time)} - {format_time(end_time)}): {text}\n\n")
|
f.write(f"{speaker_label} ({format_time(start_time)} - {format_time(end_time)}): {text}\n\n")
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user