-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsrt_translator.py
More file actions
205 lines (174 loc) · 6.63 KB
/
srt_translator.py
File metadata and controls
205 lines (174 loc) · 6.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
#!/usr/bin/env python3
import argparse
import os
import re
import sys
import shlex
import time
from math import ceil
import deepl
from langdetect import detect, DetectorFactory
from dotenv import load_dotenv
from tqdm import tqdm
# Load variables from a local .env file so that the os.getenv() lookup
# below can see them. This must run before API_KEY is read.
load_dotenv()
# Fix langdetect's seed so detection gives deterministic results.
DetectorFactory.seed = 0
# DeepL API key, taken from the environment instead of being hardcoded.
# This is None when DEEPL_API_KEY is not set; main() checks for that.
API_KEY = os.getenv("DEEPL_API_KEY")
BATCH_SIZE = 50 # number of subtitle segments sent per translate_text() call
def parse_srt(text):
    """Parse SRT text into a list of ``(index, timing, content)`` tuples.

    The text is stripped and then cut into cues wherever a blank (or
    whitespace-only) line separates two runs of text. Within a cue the
    first line is taken as the index, the second as the timing line and
    all remaining lines, re-joined with ``"\\n"``, as the content.

    Cues with fewer than three lines are dropped silently, so a cue that
    has no text does not appear in the result.
    """
    cues = []
    for chunk in re.split(r'\n\s*\n', text.strip()):
        rows = chunk.splitlines()
        if len(rows) < 3:
            # Not a complete cue (needs index + timing + at least one text line).
            continue
        number, timestamps, *body = rows
        cues.append((number, timestamps, "\n".join(body)))
    return cues
def batch_translate(texts, target_lang, translator):
    """Translate ``texts`` into ``target_lang`` in batches and return the results.

    The input list is sliced into consecutive batches of at most
    ``BATCH_SIZE`` items. Each batch is passed to
    ``translator.translate_text`` and the ``.text`` attribute of every
    returned item is collected, so the output list is in the same order as
    ``texts``. A tqdm progress bar tracks the batches.

    An empty ``texts`` list makes no API calls and returns ``[]``.
    """
    starts = range(0, len(texts), BATCH_SIZE)
    batches = [texts[start:start + BATCH_SIZE] for start in starts]
    progress = tqdm(batches, desc=f"Translating to {target_lang}", unit="batch")

    output = []
    for batch in progress:
        results = translator.translate_text(
            batch,
            target_lang=target_lang,
            # NOTE(review): presumably chosen so line breaks inside a cue are
            # not treated as sentence boundaries - confirm against DeepL docs.
            split_sentences="nonewlines",
        )
        for result in results:
            output.append(result.text)
        # Short pause between requests to go easy on the API.
        time.sleep(0.1)
    return output
def build_srt(blocks, translations):
    """Render the trilingual SRT text for ``blocks``.

    ``blocks`` is a sequence of ``(index, timing, original)`` tuples and
    ``translations`` maps each of ``'IT'``, ``'ZH'`` and ``'EN-GB'`` to a
    list holding one string per block. Every block is written as:

        index
        timing
        Italian text
        Chinese text
        English text
        (blank line)

    The original text stored in the block is not written; only the
    entries from ``translations`` are. Lines are joined with ``"\\n"``.
    """
    languages = ("IT", "ZH", "EN-GB")
    out = []
    for pos, (number, timestamps, _source) in enumerate(blocks):
        out.extend((number, timestamps))
        out.extend(translations[lang][pos] for lang in languages)
        out.append("")  # blank line that terminates the cue
    return "\n".join(out)
def get_input_paths(cli_args):
    """Return the list of SRT paths to process.

    If ``cli_args`` is non-empty it is returned unchanged. Otherwise the
    user is prompted to drag-and-drop (or type) one or more paths on a
    single line, which is tokenised with ``shlex.split`` so that quoted
    paths and backslash-escaped spaces each yield one path.

    Returns an empty list when stdin is closed (EOF) or the line is blank.
    """
    if cli_args:
        return cli_args
    try:
        raw = input("Drag your .srt file(s) here and press Enter: ").strip()
    except EOFError:
        return []
    # Do not strip surrounding quotes by hand: shlex.split already honours
    # them, and removing them first splits '"my file.srt"' at the space and
    # mis-tokenises several quoted paths on one line.
    try:
        return shlex.split(raw)
    except ValueError:
        # Unbalanced quote (e.g. an apostrophe in an unquoted file name):
        # fall back to treating the whole line as a single path.
        return [raw]
def normalize_langcode(ld_code):
    """Convert a langdetect code into the upper-case code used for DeepL.

    Any code beginning with ``zh`` (langdetect reports Chinese as
    ``zh-cn`` or ``zh-tw``) becomes ``"ZH"``. Every other code is reduced
    to its primary subtag, i.e. the part before the first ``-``, and
    upper-cased, which turns ``en`` into ``"EN"`` and ``it`` into ``"IT"``.
    """
    if ld_code.startswith("zh"):
        return "ZH"
    primary_subtag, _, _ = ld_code.partition("-")
    return primary_subtag.upper()
def _read_subtitle_text(path):
    """Return the decoded text of ``path``, or None if it cannot be read.

    Encodings are tried in order and the first that decodes without error
    wins. Progress and failures are reported on stderr.
    """
    # utf-8-sig also decodes UTF-8 without a BOM, so a separate 'utf-8'
    # entry is unnecessary. windows-1252 has to come before latin-1:
    # latin-1 maps every byte and therefore never fails, so anything listed
    # after it would never be tried.
    for encoding in ("utf-8-sig", "windows-1252", "latin-1"):
        try:
            with open(path, "r", encoding=encoding) as f:
                text = f.read()
        except UnicodeDecodeError:
            continue  # try the next encoding
        except OSError as err:
            print(f"Error: could not read {path!r}: {err}", file=sys.stderr)
            return None
        print(f"Successfully read file with encoding: {encoding}", file=sys.stderr)
        return text
    # Defensive only: latin-1 accepts any byte sequence.
    print(f"Error: Could not decode the file {path!r}.", file=sys.stderr)
    print("Please check the file's encoding and convert it to UTF-8 if possible.", file=sys.stderr)
    return None


def _detect_source_lang(originals):
    """Return the normalized source language code, or None if undetectable.

    Detection runs on the first five segments joined by spaces.
    """
    # Imported locally: the file-level import only brings in detect and
    # DetectorFactory.
    from langdetect.lang_detect_exception import LangDetectException

    sample_text = " ".join(originals[:5])
    try:
        return normalize_langcode(detect(sample_text))
    except LangDetectException:
        # Raised when the sample has nothing to detect a language from.
        return None


def main():
    """Command-line entry point: write a trilingual copy of each input SRT.

    For every input file the subtitles are parsed, the source language is
    detected, and each cue is rendered in Italian, Chinese and British
    English (reusing the original text for the language it is already in).
    The result is written next to the input as
    ``<name>_tri_it_zh_en-GB<ext>``.

    Exits with status 1 if the DeepL API key is missing or no input files
    are given. A file that cannot be read, parsed or translated is reported
    on stderr and skipped so the remaining files are still processed.
    """
    parser = argparse.ArgumentParser(
        description="Translate .srt files into trilingual (IT, ZH, EN-GB) via DeepL"
    )
    parser.add_argument(
        "input_srts",
        nargs="*",
        help="Path(s) to one or more .srt files"
    )
    args = parser.parse_args()

    # Checked after argument parsing so that --help works without a key.
    if not API_KEY:
        print(
            "Error: DEEPL_API_KEY not found.",
            "Please create a .env file in the same directory as the script and add:",
            'DEEPL_API_KEY="your-deepl-api-key-here"',
            sep="\n",
            file=sys.stderr
        )
        sys.exit(1)

    paths = get_input_paths(args.input_srts)
    if not paths:
        print("Error: no input files provided.", file=sys.stderr)
        sys.exit(1)

    translator = deepl.Translator(API_KEY)
    targets = ["IT", "ZH", "EN-GB"]

    for in_path in paths:
        if not os.path.isfile(in_path):
            print(f"Warning: skipping invalid path {in_path!r}", file=sys.stderr)
            continue

        print(f"Reading {in_path}...", file=sys.stderr)
        raw_text = _read_subtitle_text(in_path)
        if raw_text is None:
            continue  # already reported; move on to the next file

        blocks = parse_srt(raw_text)
        total = len(blocks)
        print(f"Parsed {total} subtitle blocks.", file=sys.stderr)
        if not blocks:
            # Nothing to translate, and language detection would fail on
            # an empty sample.
            print(f"Warning: no subtitle blocks found in {in_path!r}; skipping.", file=sys.stderr)
            continue
        originals = [blk[2] for blk in blocks]

        # --- Detect source language from first few segments ---
        source_lang = _detect_source_lang(originals)
        if source_lang is None:
            print("Could not detect source language; translating into all targets.", file=sys.stderr)
        else:
            print(f"Detected source language: {source_lang}", file=sys.stderr)

        # --- Prepare translations for IT, ZH, EN-GB ---
        translations = {}
        try:
            for tgt in targets:
                # If source == target (or EN→EN-GB), reuse originals:
                if (source_lang == tgt) or (source_lang == "EN" and tgt == "EN-GB"):
                    translations[tgt] = originals
                    print(f"Reusing originals for {tgt}", file=sys.stderr)
                else:
                    translations[tgt] = batch_translate(originals, tgt, translator)
        except deepl.DeepLException as err:
            print(f"Error: translation failed for {in_path!r}: {err}", file=sys.stderr)
            continue

        # --- Build and write the output SRT ---
        print("Building output SRT...", file=sys.stderr)
        out_content = build_srt(blocks, translations)
        base, ext = os.path.splitext(in_path)
        out_path = f"{base}_tri_it_zh_en-GB{ext}"
        print(f"Writing to {out_path}...", file=sys.stderr)
        with open(out_path, "w", encoding="utf-8") as f:
            f.write(out_content)
        print(f"→ Wrote trilingual SRT → {out_path}", file=sys.stderr)

    print("All done! 🎉", file=sys.stderr)
# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()