diff --git a/GPT_SoVITS/inference_webui_fast.py b/GPT_SoVITS/inference_webui_fast.py index 92d145b3c..d24be4c5e 100644 --- a/GPT_SoVITS/inference_webui_fast.py +++ b/GPT_SoVITS/inference_webui_fast.py @@ -6,20 +6,7 @@ 全部按英文识别 全部按日文识别 """ -import psutil -import os -def set_high_priority(): - """把当前 Python 进程设为 HIGH_PRIORITY_CLASS""" - if os.name != "nt": - return # 仅 Windows 有效 - p = psutil.Process(os.getpid()) - try: - p.nice(psutil.HIGH_PRIORITY_CLASS) - print("已将进程优先级设为 High") - except psutil.AccessDenied: - print("权限不足,无法修改优先级(请用管理员运行)") -set_high_priority() import json import logging import os @@ -60,7 +47,6 @@ def set_high_priority(): from TTS_infer_pack.text_segmentation_method import get_method from TTS_infer_pack.TTS import NO_PROMPT_ERROR, TTS, TTS_Config -from tools.assets import css, js, top_html from tools.i18n.i18n import I18nAuto, scan_language_list language = os.environ.get("language", "Auto") @@ -112,28 +98,66 @@ def set_high_priority(): i18n("按标点符号切"): "cut5", } -from config import change_choices, get_weights_names, name2gpt_path, name2sovits_path +# 推理参数预设系统 +INFERENCE_PRESETS = { + i18n("快速合成"): { + "batch_size": 1, + "sample_steps": 8, + "top_k": 5, + "top_p": 1, + "temperature": 1, + "repetition_penalty": 1.35, + "parallel_infer": True, + "split_bucket": True, + }, + i18n("高质量"): { + "batch_size": 1, + "sample_steps": 64, + "top_k": 15, + "top_p": 0.8, + "temperature": 0.8, + "repetition_penalty": 1.35, + "parallel_infer": False, + "split_bucket": False, + }, + i18n("平衡"): { + "batch_size": 20, + "sample_steps": 32, + "top_k": 5, + "top_p": 1, + "temperature": 1, + "repetition_penalty": 1.35, + "parallel_infer": True, + "split_bucket": True, + }, + i18n("自定义"): None, # 不应用任何预设 +} -SoVITS_names, GPT_names = get_weights_names() -from config import pretrained_sovits_name - -path_sovits_v3 = pretrained_sovits_name["v3"] -path_sovits_v4 = pretrained_sovits_name["v4"] -is_exist_s2gv3 = os.path.exists(path_sovits_v3) -is_exist_s2gv4 = os.path.exists(path_sovits_v4) +def apply_preset(preset_name): + """Apply inference preset and return updated parameter values""" + if preset_name == i18n("自定义") or preset_name not in INFERENCE_PRESETS: + # Return current values without changes + return [gr.update() for _ in range(8)] + + preset = INFERENCE_PRESETS[preset_name] + return [ + gr.update(value=preset["batch_size"]), + gr.update(value=preset["sample_steps"]), + gr.update(value=preset["top_k"]), + gr.update(value=preset["top_p"]), + gr.update(value=preset["temperature"]), + gr.update(value=preset["repetition_penalty"]), + gr.update(value=preset["parallel_infer"]), + gr.update(value=preset["split_bucket"]), + ] tts_config = TTS_Config("GPT_SoVITS/configs/tts_infer.yaml") tts_config.device = device tts_config.is_half = is_half -# tts_config.version = version -tts_config.update_version(version) +tts_config.version = version if gpt_path is not None: - if "!" in gpt_path or "!" in gpt_path: - gpt_path = name2gpt_path[gpt_path] tts_config.t2s_weights_path = gpt_path if sovits_path is not None: - if "!" in sovits_path or "!" 
in sovits_path: - sovits_path = name2sovits_path[sovits_path] tts_config.vits_weights_path = sovits_path if cnhubert_base_path is not None: tts_config.cnhuhbert_base_path = cnhubert_base_path @@ -209,6 +233,40 @@ def custom_sort_key(s): return parts +def change_choices(): + SoVITS_names, GPT_names = get_weights_names(GPT_weight_root, SoVITS_weight_root) + return {"choices": sorted(SoVITS_names, key=custom_sort_key), "__type__": "update"}, { + "choices": sorted(GPT_names, key=custom_sort_key), + "__type__": "update", + } + + +path_sovits_v3 = "GPT_SoVITS/pretrained_models/s2Gv3.pth" +path_sovits_v4 = "GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth" +is_exist_s2gv3 = os.path.exists(path_sovits_v3) +is_exist_s2gv4 = os.path.exists(path_sovits_v4) +pretrained_sovits_name = [ + "GPT_SoVITS/pretrained_models/s2G488k.pth", + "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth", + "GPT_SoVITS/pretrained_models/s2Gv3.pth", + "GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth", +] +pretrained_gpt_name = [ + "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt", + "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt", + "GPT_SoVITS/pretrained_models/s1v3.ckpt", + "GPT_SoVITS/pretrained_models/s1v3.ckpt", +] + + +_ = [[], []] +for i in range(4): + if os.path.exists(pretrained_gpt_name[i]): + _[0].append(pretrained_gpt_name[i]) + if os.path.exists(pretrained_sovits_name[i]): + _[-1].append(pretrained_sovits_name[i]) +pretrained_gpt_name, pretrained_sovits_name = _ + if os.path.exists("./weight.json"): pass else: @@ -218,28 +276,50 @@ def custom_sort_key(s): with open("./weight.json", "r", encoding="utf-8") as file: weight_data = file.read() weight_data = json.loads(weight_data) - gpt_path = os.environ.get("gpt_path", weight_data.get("GPT", {}).get(version, GPT_names[-1])) - sovits_path = os.environ.get("sovits_path", weight_data.get("SoVITS", {}).get(version, SoVITS_names[0])) + gpt_path = os.environ.get("gpt_path", weight_data.get("GPT", {}).get(version, pretrained_gpt_name)) + sovits_path = os.environ.get("sovits_path", weight_data.get("SoVITS", {}).get(version, pretrained_sovits_name)) if isinstance(gpt_path, list): gpt_path = gpt_path[0] if isinstance(sovits_path, list): sovits_path = sovits_path[0] + +SoVITS_weight_root = ["SoVITS_weights", "SoVITS_weights_v2", "SoVITS_weights_v3", "SoVITS_weights_v4"] +GPT_weight_root = ["GPT_weights", "GPT_weights_v2", "GPT_weights_v3", "GPT_weights_v4"] +for path in SoVITS_weight_root + GPT_weight_root: + os.makedirs(path, exist_ok=True) + + +def get_weights_names(GPT_weight_root, SoVITS_weight_root): + SoVITS_names = [i for i in pretrained_sovits_name] + for path in SoVITS_weight_root: + for name in os.listdir(path): + if name.endswith(".pth"): + SoVITS_names.append("%s/%s" % (path, name)) + GPT_names = [i for i in pretrained_gpt_name] + for path in GPT_weight_root: + for name in os.listdir(path): + if name.endswith(".ckpt"): + GPT_names.append("%s/%s" % (path, name)) + return SoVITS_names, GPT_names + + +SoVITS_names, GPT_names = get_weights_names(GPT_weight_root, SoVITS_weight_root) + + from process_ckpt import get_sovits_version_from_path_fast v3v4set = {"v3", "v4"} def change_sovits_weights(sovits_path, prompt_language=None, text_language=None): - if "!" in sovits_path or "!" 
in sovits_path:
-        sovits_path = name2sovits_path[sovits_path]
     global version, model_version, dict_language, if_lora_v3
     version, model_version, if_lora_v3 = get_sovits_version_from_path_fast(sovits_path)
     # print(sovits_path,version, model_version, if_lora_v3)
     is_exist = is_exist_s2gv3 if model_version == "v3" else is_exist_s2gv4
     path_sovits = path_sovits_v3 if model_version == "v3" else path_sovits_v4
     if if_lora_v3 == True and is_exist == False:
-        info = path_sovits + "SoVITS %s" % model_version + i18n("底模缺失,无法加载相应 LoRA 权重")
+        info = path_sovits + i18n("SoVITS %s 底模缺失,无法加载相应 LoRA 权重" % model_version)
         gr.Warning(info)
         raise FileExistsError(info)
     dict_language = dict_language_v1 if version == "v1" else dict_language_v2
@@ -297,19 +377,11 @@ def change_sovits_weights(sovits_path, prompt_language=None, text_language=None)
         f.write(json.dumps(data))


-def change_gpt_weights(gpt_path):
-    if "!" in gpt_path or "!" in gpt_path:
-        gpt_path = name2gpt_path[gpt_path]
-    tts_pipeline.init_t2s_weights(gpt_path)
-
-
-with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css) as app:
-    gr.HTML(
-        top_html.format(
-            i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.")
-            + i18n("如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.")
-        ),
-        elem_classes="markdown",
+with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False) as app:
+    gr.Markdown(
+        value=i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.")
+        + "<br>
" + + i18n("如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.") ) with gr.Column(): @@ -368,6 +440,14 @@ def change_gpt_weights(gpt_path): with gr.Group(): gr.Markdown(value=i18n("推理设置")) + with gr.Row(): + preset_dropdown = gr.Dropdown( + label=i18n("参数预设"), + choices=list(INFERENCE_PRESETS.keys()), + value=i18n("平衡"), + interactive=True, + info=i18n("选择预设可快速配置推理参数") + ) with gr.Row(): with gr.Column(): with gr.Row(): @@ -382,10 +462,10 @@ def change_gpt_weights(gpt_path): minimum=0.01, maximum=1, step=0.01, label=i18n("分段间隔(秒)"), value=0.3, interactive=True ) speed_factor = gr.Slider( - minimum=0.6, maximum=1.65, step=0.05, label="语速", value=1.0, interactive=True + minimum=0.6, maximum=1.65, step=0.05, label=i18n("语速"), value=1.0, interactive=True ) with gr.Row(): - top_k = gr.Slider(minimum=1, maximum=100, step=1, label=i18n("top_k"), value=15, interactive=True) + top_k = gr.Slider(minimum=1, maximum=100, step=1, label=i18n("top_k"), value=5, interactive=True) top_p = gr.Slider(minimum=0, maximum=1, step=0.05, label=i18n("top_p"), value=1, interactive=True) with gr.Row(): temperature = gr.Slider( @@ -477,7 +557,14 @@ def change_gpt_weights(gpt_path): inference_button, ], ) # - GPT_dropdown.change(change_gpt_weights, [GPT_dropdown], []) + GPT_dropdown.change(tts_pipeline.init_t2s_weights, [GPT_dropdown], []) + + # 预设选择事件绑定 + preset_dropdown.change( + apply_preset, + [preset_dropdown], + [batch_size, sample_steps, top_k, top_p, temperature, repetition_penalty, parallel_infer, split_bucket] + ) with gr.Group(): gr.Markdown( diff --git a/api.py b/api.py index cc0896a20..ca97e30f9 100644 --- a/api.py +++ b/api.py @@ -163,7 +163,7 @@ import numpy as np from feature_extractor import cnhubert from io import BytesIO -from module.models import Generator, SynthesizerTrn, SynthesizerTrnV3 +from module.models import SynthesizerTrn, SynthesizerTrnV3 from peft import LoraConfig, get_peft_model from AR.models.t2s_lightning_module import Text2SemanticLightningModule from text import cleaned_text_to_sequence @@ -198,44 +198,39 @@ def is_full(*items): # 任意一项为空返回False return True -bigvgan_model = hifigan_model = sv_cn_model = None - - -def clean_hifigan_model(): - global hifigan_model - if hifigan_model: - hifigan_model = hifigan_model.cpu() - hifigan_model = None - try: - torch.cuda.empty_cache() - except: - pass - - -def clean_bigvgan_model(): - global bigvgan_model - if bigvgan_model: - bigvgan_model = bigvgan_model.cpu() - bigvgan_model = None - try: - torch.cuda.empty_cache() - except: - pass - - -def clean_sv_cn_model(): - global sv_cn_model - if sv_cn_model: - sv_cn_model.embedding_model = sv_cn_model.embedding_model.cpu() - sv_cn_model = None - try: - torch.cuda.empty_cache() - except: - pass +def normalize_api_params(params: dict) -> dict: + """ + Normalize API parameters to support both v1 and v2 naming conventions. + This provides backward compatibility for different API versions. 
+ + Mapping (v2 -> v1): + ref_audio_path -> refer_wav_path + text_lang -> text_language + prompt_lang -> prompt_language + speed_factor -> speed + """ + param_mappings = { + # v2 style -> v1 style + "ref_audio_path": "refer_wav_path", + "text_lang": "text_language", + "prompt_lang": "prompt_language", + "speed_factor": "speed", + "aux_ref_audio_paths": "inp_refs", + } + + normalized = {} + for key, value in params.items(): + # If this is a v2 parameter name, map it to v1; otherwise keep original + normalized_key = param_mappings.get(key, key) + # Don't overwrite if v1 key already exists + if normalized_key not in normalized: + normalized[normalized_key] = value + + return normalized def init_bigvgan(): - global bigvgan_model, hifigan_model, sv_cn_model + global bigvgan_model from BigVGAN import bigvgan bigvgan_model = bigvgan.BigVGAN.from_pretrained( @@ -245,57 +240,20 @@ def init_bigvgan(): # remove weight norm in the model and set to eval mode bigvgan_model.remove_weight_norm() bigvgan_model = bigvgan_model.eval() - if is_half == True: bigvgan_model = bigvgan_model.half().to(device) else: bigvgan_model = bigvgan_model.to(device) -def init_hifigan(): - global hifigan_model, bigvgan_model, sv_cn_model - hifigan_model = Generator( - initial_channel=100, - resblock="1", - resblock_kernel_sizes=[3, 7, 11], - resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]], - upsample_rates=[10, 6, 2, 2, 2], - upsample_initial_channel=512, - upsample_kernel_sizes=[20, 12, 4, 4, 4], - gin_channels=0, - is_bias=True, - ) - hifigan_model.eval() - hifigan_model.remove_weight_norm() - state_dict_g = torch.load( - "%s/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth" % (now_dir,), - map_location="cpu", - weights_only=False, - ) - print("loading vocoder", hifigan_model.load_state_dict(state_dict_g)) - if is_half == True: - hifigan_model = hifigan_model.half().to(device) - else: - hifigan_model = hifigan_model.to(device) - - -from sv import SV - - -def init_sv_cn(): - global hifigan_model, bigvgan_model, sv_cn_model - sv_cn_model = SV(device, is_half) - - resample_transform_dict = {} -def resample(audio_tensor, sr0, sr1, device): +def resample(audio_tensor, sr0): global resample_transform_dict - key = "%s-%s-%s" % (sr0, sr1, str(device)) - if key not in resample_transform_dict: - resample_transform_dict[key] = torchaudio.transforms.Resample(sr0, sr1).to(device) - return resample_transform_dict[key](audio_tensor) + if sr0 not in resample_transform_dict: + resample_transform_dict[sr0] = torchaudio.transforms.Resample(sr0, 24000).to(device) + return resample_transform_dict[sr0](audio_tensor) from module.mel_processing import mel_spectrogram_torch @@ -325,19 +283,6 @@ def denorm_spec(x): "center": False, }, ) -mel_fn_v4 = lambda x: mel_spectrogram_torch( - x, - **{ - "n_fft": 1280, - "win_size": 1280, - "hop_size": 320, - "num_mels": 100, - "sampling_rate": 32000, - "fmin": 0, - "fmax": None, - "center": False, - }, -) sr_model = None @@ -379,19 +324,12 @@ def __init__(self, vq_model, hps): def get_sovits_weights(sovits_path): - from config import pretrained_sovits_name - - path_sovits_v3 = pretrained_sovits_name["v3"] - path_sovits_v4 = pretrained_sovits_name["v4"] + path_sovits_v3 = "GPT_SoVITS/pretrained_models/s2Gv3.pth" is_exist_s2gv3 = os.path.exists(path_sovits_v3) - is_exist_s2gv4 = os.path.exists(path_sovits_v4) version, model_version, if_lora_v3 = get_sovits_version_from_path_fast(sovits_path) - is_exist = is_exist_s2gv3 if model_version == "v3" else is_exist_s2gv4 - path_sovits = 
path_sovits_v3 if model_version == "v3" else path_sovits_v4 - - if if_lora_v3 == True and is_exist == False: - logger.info("SoVITS %s 底模缺失,无法加载相应 LoRA 权重" % model_version) + if if_lora_v3 == True and is_exist_s2gv3 == False: + logger.info("SoVITS V3 底模缺失,无法加载相应 LoRA 权重") dict_s2 = load_sovits_new(sovits_path) hps = dict_s2["config"] @@ -404,13 +342,11 @@ def get_sovits_weights(sovits_path): else: hps.model.version = "v2" - model_params_dict = vars(hps.model) - if model_version not in {"v3", "v4"}: - if "Pro" in model_version: - hps.model.version = model_version - if sv_cn_model == None: - init_sv_cn() + if model_version == "v3": + hps.model.version = "v3" + model_params_dict = vars(hps.model) + if model_version != "v3": vq_model = SynthesizerTrn( hps.data.filter_length // 2 + 1, hps.train.segment_size // hps.data.hop_length, @@ -418,18 +354,13 @@ def get_sovits_weights(sovits_path): **model_params_dict, ) else: - hps.model.version = model_version vq_model = SynthesizerTrnV3( hps.data.filter_length // 2 + 1, hps.train.segment_size // hps.data.hop_length, n_speakers=hps.data.n_speakers, **model_params_dict, ) - if model_version == "v3": - init_bigvgan() - if model_version == "v4": - init_hifigan() - + init_bigvgan() model_version = hps.model.version logger.info(f"模型版本: {model_version}") if "pretrained" not in sovits_path: @@ -445,8 +376,7 @@ def get_sovits_weights(sovits_path): if if_lora_v3 == False: vq_model.load_state_dict(dict_s2["weight"], strict=False) else: - path_sovits = path_sovits_v3 if model_version == "v3" else path_sovits_v4 - vq_model.load_state_dict(load_sovits_new(path_sovits)["weight"], strict=False) + vq_model.load_state_dict(load_sovits_new(path_sovits_v3)["weight"], strict=False) lora_rank = dict_s2["lora_rank"] lora_config = LoraConfig( target_modules=["to_k", "to_q", "to_v", "to_out.0"], @@ -475,7 +405,7 @@ def __init__(self, max_sec, t2s_model): def get_gpt_weights(gpt_path): - dict_s1 = torch.load(gpt_path, map_location="cpu", weights_only=False) + dict_s1 = torch.load(gpt_path, map_location="cpu") config = dict_s1["config"] max_sec = config["data"]["max_sec"] t2s_model = Text2SemanticLightningModule(config, "****", is_train=False) @@ -543,65 +473,62 @@ def get_bert_inf(phones, word2ph, norm_text, language): def get_phones_and_bert(text, language, version, final=False): - text = re.sub(r' {2,}', ' ', text) - textlist = [] - langlist = [] - if language == "all_zh": - for tmp in LangSegmenter.getTexts(text,"zh"): - langlist.append(tmp["lang"]) - textlist.append(tmp["text"]) - elif language == "all_yue": - for tmp in LangSegmenter.getTexts(text,"zh"): - if tmp["lang"] == "zh": - tmp["lang"] = "yue" - langlist.append(tmp["lang"]) - textlist.append(tmp["text"]) - elif language == "all_ja": - for tmp in LangSegmenter.getTexts(text,"ja"): - langlist.append(tmp["lang"]) - textlist.append(tmp["text"]) - elif language == "all_ko": - for tmp in LangSegmenter.getTexts(text,"ko"): - langlist.append(tmp["lang"]) - textlist.append(tmp["text"]) - elif language == "en": - langlist.append("en") - textlist.append(text) - elif language == "auto": - for tmp in LangSegmenter.getTexts(text): - langlist.append(tmp["lang"]) - textlist.append(tmp["text"]) - elif language == "auto_yue": - for tmp in LangSegmenter.getTexts(text): - if tmp["lang"] == "zh": - tmp["lang"] = "yue" - langlist.append(tmp["lang"]) - textlist.append(tmp["text"]) - else: - for tmp in LangSegmenter.getTexts(text): - if langlist: - if (tmp["lang"] == "en" and langlist[-1] == "en") or (tmp["lang"] != "en" and langlist[-1] 
!= "en"): - textlist[-1] += tmp["text"] - continue - if tmp["lang"] == "en": - langlist.append(tmp["lang"]) + if language in {"en", "all_zh", "all_ja", "all_ko", "all_yue"}: + formattext = text + while " " in formattext: + formattext = formattext.replace(" ", " ") + if language == "all_zh": + if re.search(r"[A-Za-z]", formattext): + formattext = re.sub(r"[a-z]", lambda x: x.group(0).upper(), formattext) + formattext = chinese.mix_text_normalize(formattext) + return get_phones_and_bert(formattext, "zh", version) else: - # 因无法区别中日韩文汉字,以用户输入为准 - langlist.append(language) - textlist.append(tmp["text"]) - phones_list = [] - bert_list = [] - norm_text_list = [] - for i in range(len(textlist)): - lang = langlist[i] - phones, word2ph, norm_text = clean_text_inf(textlist[i], lang, version) - bert = get_bert_inf(phones, word2ph, norm_text, lang) - phones_list.append(phones) - norm_text_list.append(norm_text) - bert_list.append(bert) - bert = torch.cat(bert_list, dim=1) - phones = sum(phones_list, []) - norm_text = "".join(norm_text_list) + phones, word2ph, norm_text = clean_text_inf(formattext, language, version) + bert = get_bert_feature(norm_text, word2ph).to(device) + elif language == "all_yue" and re.search(r"[A-Za-z]", formattext): + formattext = re.sub(r"[a-z]", lambda x: x.group(0).upper(), formattext) + formattext = chinese.mix_text_normalize(formattext) + return get_phones_and_bert(formattext, "yue", version) + else: + phones, word2ph, norm_text = clean_text_inf(formattext, language, version) + bert = torch.zeros( + (1024, len(phones)), + dtype=torch.float16 if is_half == True else torch.float32, + ).to(device) + elif language in {"zh", "ja", "ko", "yue", "auto", "auto_yue"}: + textlist = [] + langlist = [] + if language == "auto": + for tmp in LangSegmenter.getTexts(text): + langlist.append(tmp["lang"]) + textlist.append(tmp["text"]) + elif language == "auto_yue": + for tmp in LangSegmenter.getTexts(text): + if tmp["lang"] == "zh": + tmp["lang"] = "yue" + langlist.append(tmp["lang"]) + textlist.append(tmp["text"]) + else: + for tmp in LangSegmenter.getTexts(text): + if tmp["lang"] == "en": + langlist.append(tmp["lang"]) + else: + # 因无法区别中日韩文汉字,以用户输入为准 + langlist.append(language) + textlist.append(tmp["text"]) + phones_list = [] + bert_list = [] + norm_text_list = [] + for i in range(len(textlist)): + lang = langlist[i] + phones, word2ph, norm_text = clean_text_inf(textlist[i], lang, version) + bert = get_bert_inf(phones, word2ph, norm_text, lang) + phones_list.append(phones) + norm_text_list.append(norm_text) + bert_list.append(bert) + bert = torch.cat(bert_list, dim=1) + phones = sum(phones_list, []) + norm_text = "".join(norm_text_list) if not final and len(phones) < 6: return get_phones_and_bert("." 
+ text, language, version, final=True) @@ -637,34 +564,23 @@ def __delattr__(self, item): raise AttributeError(f"Attribute {item} not found") -def get_spepc(hps, filename, dtype, device, is_v2pro=False): - sr1 = int(hps.data.sampling_rate) - audio, sr0 = torchaudio.load(filename) - if sr0 != sr1: - audio = audio.to(device) - if audio.shape[0] == 2: - audio = audio.mean(0).unsqueeze(0) - audio = resample(audio, sr0, sr1, device) - else: - audio = audio.to(device) - if audio.shape[0] == 2: - audio = audio.mean(0).unsqueeze(0) - +def get_spepc(hps, filename): + audio, _ = librosa.load(filename, sr=int(hps.data.sampling_rate)) + audio = torch.FloatTensor(audio) maxx = audio.abs().max() if maxx > 1: audio /= min(2, maxx) + audio_norm = audio + audio_norm = audio_norm.unsqueeze(0) spec = spectrogram_torch( - audio, + audio_norm, hps.data.filter_length, hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length, center=False, ) - spec = spec.to(dtype) - if is_v2pro == True: - audio = resample(audio, sr1, 16000, device).to(dtype) - return spec, audio + return spec def pack_audio(audio_bytes, data, rate): @@ -851,16 +767,6 @@ def get_tts_wav( t2s_model = infer_gpt.t2s_model max_sec = infer_gpt.max_sec - if version == "v3": - if sample_steps not in [4, 8, 16, 32, 64, 128]: - sample_steps = 32 - elif version == "v4": - if sample_steps not in [4, 8, 16, 32]: - sample_steps = 8 - - if if_sr and version != "v3": - if_sr = False - t0 = ttime() prompt_text = prompt_text.strip("\n") if prompt_text[-1] not in splits: @@ -884,29 +790,19 @@ def get_tts_wav( prompt_semantic = codes[0, 0] prompt = prompt_semantic.unsqueeze(0).to(device) - is_v2pro = version in {"v2Pro", "v2ProPlus"} - if version not in {"v3", "v4"}: + if version != "v3": refers = [] - if is_v2pro: - sv_emb = [] - if sv_cn_model == None: - init_sv_cn() if inp_refs: for path in inp_refs: - try: #####这里加上提取sv的逻辑,要么一堆sv一堆refer,要么单个sv单个refer - refer, audio_tensor = get_spepc(hps, path.name, dtype, device, is_v2pro) + try: + refer = get_spepc(hps, path).to(dtype).to(device) refers.append(refer) - if is_v2pro: - sv_emb.append(sv_cn_model.compute_embedding3(audio_tensor)) except Exception as e: logger.error(e) if len(refers) == 0: - refers, audio_tensor = get_spepc(hps, ref_wav_path, dtype, device, is_v2pro) - refers = [refers] - if is_v2pro: - sv_emb = [sv_cn_model.compute_embedding3(audio_tensor)] + refers = [get_spepc(hps, ref_wav_path).to(dtype).to(device)] else: - refer, audio_tensor = get_spepc(hps, ref_wav_path, dtype, device) + refer = get_spepc(hps, ref_wav_path).to(device).to(dtype) t1 = ttime() # os.environ['version'] = version @@ -946,56 +842,41 @@ def get_tts_wav( pred_semantic = pred_semantic[:, -idx:].unsqueeze(0) t3 = ttime() - if version not in {"v3", "v4"}: - if is_v2pro: - audio = ( - vq_model.decode( - pred_semantic, - torch.LongTensor(phones2).to(device).unsqueeze(0), - refers, - speed=speed, - sv_emb=sv_emb, - ) - .detach() - .cpu() - .numpy()[0, 0] - ) - else: - audio = ( - vq_model.decode( - pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refers, speed=speed - ) - .detach() - .cpu() - .numpy()[0, 0] - ) + if version != "v3": + audio = ( + vq_model.decode(pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refers, speed=speed) + .detach() + .cpu() + .numpy()[0, 0] + ) ###试试重建不带上prompt部分 else: phoneme_ids0 = torch.LongTensor(phones1).to(device).unsqueeze(0) phoneme_ids1 = torch.LongTensor(phones2).to(device).unsqueeze(0) - + # print(11111111, phoneme_ids0, phoneme_ids1) fea_ref, ge = 
vq_model.decode_encp(prompt.unsqueeze(0), phoneme_ids0, refer) ref_audio, sr = torchaudio.load(ref_wav_path) ref_audio = ref_audio.to(device).float() if ref_audio.shape[0] == 2: ref_audio = ref_audio.mean(0).unsqueeze(0) - - tgt_sr = 24000 if version == "v3" else 32000 - if sr != tgt_sr: - ref_audio = resample(ref_audio, sr, tgt_sr, device) - mel2 = mel_fn(ref_audio) if version == "v3" else mel_fn_v4(ref_audio) + if sr != 24000: + ref_audio = resample(ref_audio, sr) + # print("ref_audio",ref_audio.abs().mean()) + mel2 = mel_fn(ref_audio) mel2 = norm_spec(mel2) T_min = min(mel2.shape[2], fea_ref.shape[2]) mel2 = mel2[:, :, :T_min] fea_ref = fea_ref[:, :, :T_min] - Tref = 468 if version == "v3" else 500 - Tchunk = 934 if version == "v3" else 1000 - if T_min > Tref: - mel2 = mel2[:, :, -Tref:] - fea_ref = fea_ref[:, :, -Tref:] - T_min = Tref - chunk_len = Tchunk - T_min + if T_min > 468: + mel2 = mel2[:, :, -468:] + fea_ref = fea_ref[:, :, -468:] + T_min = 468 + chunk_len = 934 - T_min + # print("fea_ref",fea_ref,fea_ref.shape) + # print("mel2",mel2) mel2 = mel2.to(dtype) fea_todo, ge = vq_model.decode_encp(pred_semantic, phoneme_ids1, refer, ge, speed) + # print("fea_todo",fea_todo) + # print("ge",ge.abs().mean()) cfm_resss = [] idx = 0 while 1: @@ -1004,24 +885,22 @@ def get_tts_wav( break idx += chunk_len fea = torch.cat([fea_ref, fea_todo_chunk], 2).transpose(2, 1) + # set_seed(123) cfm_res = vq_model.cfm.inference( fea, torch.LongTensor([fea.size(1)]).to(fea.device), mel2, sample_steps, inference_cfg_rate=0 ) cfm_res = cfm_res[:, :, mel2.shape[2] :] mel2 = cfm_res[:, :, -T_min:] + # print("fea", fea) + # print("mel2in", mel2) fea_ref = fea_todo_chunk[:, :, -T_min:] cfm_resss.append(cfm_res) - cfm_res = torch.cat(cfm_resss, 2) - cfm_res = denorm_spec(cfm_res) - if version == "v3": - if bigvgan_model == None: - init_bigvgan() - else: # v4 - if hifigan_model == None: - init_hifigan() - vocoder_model = bigvgan_model if version == "v3" else hifigan_model + cmf_res = torch.cat(cfm_resss, 2) + cmf_res = denorm_spec(cmf_res) + if bigvgan_model == None: + init_bigvgan() with torch.inference_mode(): - wav_gen = vocoder_model(cfm_res) + wav_gen = bigvgan_model(cmf_res) audio = wav_gen[0][0].cpu().detach().numpy() max_audio = np.abs(audio).max() @@ -1032,13 +911,7 @@ def get_tts_wav( audio_opt = np.concatenate(audio_opt, 0) t4 = ttime() - if version in {"v1", "v2", "v2Pro", "v2ProPlus"}: - sr = 32000 - elif version == "v3": - sr = 24000 - else: - sr = 48000 # v4 - + sr = hps.data.sampling_rate if version != "v3" else 24000 if if_sr and sr == 24000: audio_opt = torch.from_numpy(audio_opt).float().to(device) audio_opt, sr = audio_sr(audio_opt.unsqueeze(0), sr) @@ -1058,12 +931,8 @@ def get_tts_wav( if not stream_mode == "normal": if media_type == "wav": - if version in {"v1", "v2", "v2Pro", "v2ProPlus"}: - sr = 32000 - elif version == "v3": - sr = 48000 if if_sr else 24000 - else: - sr = 48000 # v4 + sr = 48000 if if_sr else 24000 + sr = hps.data.sampling_rate if version != "v3" else sr audio_bytes = pack_wav(audio_bytes, sr) yield audio_bytes.getvalue() @@ -1128,6 +997,9 @@ def handle( if not default_refer.is_ready(): return JSONResponse({"code": 400, "message": "未指定参考音频且接口无预设"}, status_code=400) + if sample_steps not in [4, 8, 16, 32]: + sample_steps = 32 + if cut_punc == None: text = cut_text(text, default_cut_punc) else: @@ -1230,10 +1102,10 @@ def handle( # 模型路径检查 if sovits_path == "": sovits_path = g_config.pretrained_sovits_path - logger.warning(f"未指定SoVITS模型路径, fallback后当前值: {sovits_path}") + 
logger.warn(f"未指定SoVITS模型路径, fallback后当前值: {sovits_path}") if gpt_path == "": gpt_path = g_config.pretrained_gpt_path - logger.warning(f"未指定GPT模型路径, fallback后当前值: {gpt_path}") + logger.warn(f"未指定GPT模型路径, fallback后当前值: {gpt_path}") # 指定默认参考音频, 调用方 未提供/未给全 参考音频参数时使用 if default_refer.path == "" or default_refer.text == "" or default_refer.language == "": @@ -1341,20 +1213,22 @@ async def change_refer(refer_wav_path: str = None, prompt_text: str = None, prom @app.post("/") async def tts_endpoint(request: Request): json_post_raw = await request.json() + # Normalize parameters to support both v1 and v2 naming conventions + params = normalize_api_params(json_post_raw) return handle( - json_post_raw.get("refer_wav_path"), - json_post_raw.get("prompt_text"), - json_post_raw.get("prompt_language"), - json_post_raw.get("text"), - json_post_raw.get("text_language"), - json_post_raw.get("cut_punc"), - json_post_raw.get("top_k", 15), - json_post_raw.get("top_p", 1.0), - json_post_raw.get("temperature", 1.0), - json_post_raw.get("speed", 1.0), - json_post_raw.get("inp_refs", []), - json_post_raw.get("sample_steps", 32), - json_post_raw.get("if_sr", False), + params.get("refer_wav_path"), + params.get("prompt_text"), + params.get("prompt_language"), + params.get("text"), + params.get("text_language"), + params.get("cut_punc"), + params.get("top_k", 15), + params.get("top_p", 1.0), + params.get("temperature", 1.0), + params.get("speed", 1.0), + params.get("inp_refs", []), + params.get("sample_steps", 32), + params.get("if_sr", False), ) diff --git a/tools/i18n/locale/en_US.json b/tools/i18n/locale/en_US.json index 561d3bfd3..dbd51d3cd 100644 --- a/tools/i18n/locale/en_US.json +++ b/tools/i18n/locale/en_US.json @@ -18,7 +18,6 @@ "ASR 模型": "ASR model", "ASR 模型尺寸": "ASR model size", "ASR 语言设置": "ASR language", - "CPU训练,较慢": "Training on CPU (slower)", "GPT 训练: 模型权重文件在 GPT_weights/": "GPT Training: Model Weights saved in GPT_weights/", "GPT模型列表": "GPT weight list", "GPT训练": "GPT Training", @@ -26,19 +25,18 @@ "GPU卡号,只能填1个整数": "GPU number, can only input ONE integer", "GPU卡号以-分割,每个卡号一个进程": "GPU number is separated by -, each GPU will run one process ", "LoRA秩": "LoRA Rank", + "SoVITS V3 底模缺失,无法加载相应 LoRA 权重": "Missing Pretrained SoVITS V3 Model, Cannot Load LoRA Weights", "SoVITS 训练: 模型权重文件在 SoVITS_weights/": "SoVITS Training: Model Weights saved in SoVITS_weights/", "SoVITS模型列表": "SoVITS weight list", "SoVITS训练": "SoVITS Training", - "Submit Text: 将当前页所有文本框内容手工保存到内存和文件(翻页前后或者退出标注页面前如果没点这个按钮,你再翻回来就回滚了,白忙活。)": "Submit Text: Manually save all text box contents on the current page to memory and file (If you don't click this button before switching pages or exiting the labeling page, the data will be rolled back when you return, which would be a waste of work.)", "TTS推理WebUI": "TTS Inference WebUI", "UVR5人声伴奏分离&去混响去延迟工具": "UVR5 WebUI (Vocal Separation/Deecho/Dereverb)", - "V3不支持无参考文本模式,请填写参考文本!": "V3 does not support the no-reference-text mode. 
Please provide reference text!",
     "alpha_mix:混多少比例归一化后音频进来": "alpha_mix: proportion of normalized audio merged into dataset",
     "batch_size": "Batch Size",
     "hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop_size: FO hop size, the smaller the value, the higher the accuracy)",
     "max:归一化后最大值多少": "Loudness multiplier after normalized",
     "max_sil_kept:切完后静音最多留多长": "Maximum length for silence to be kept",
     "min_interval:最短切割间隔": "Minimum interval for audio cutting",
     "min_length:每段最小多长,如果第一段太短一直和后面段连起来直到超过这个值": "min_length: the minimum length of each segment. If the first segment is too short, it will be concatenated with the next segment until it exceeds this value",
     "temperature": "temperature",
     "threshold:音量小于这个值视作静音的备选切割点": "Noise gate threshold (loudness below this value will be treated as noise",
@@ -47,11 +45,8 @@
     "v3暂不支持该模式,使用了会报错。": "v3 does not support this mode currently, using it will cause an error.",
     "v3输出如果觉得闷可以试试开超分": "For V3 model, if generated audio sounds somewhat muffled, try enable audio super-resolution.",
     "不切": "No slice",
-    "不训练直接推v2ProPlus底模!": "Use v2ProPlus base model directly without training!",
-    "不训练直接推v2Pro底模!": "Use v2Pro base model directly without training!",
-    "不训练直接推v2底模!": "Use v2 base model directly without training!",
-    "不训练直接推v3底模!": "Use v3 base model directly without training!",
     "中文": "Chinese",
+    "中文教程文档": "Chinese Tutorial",
     "中英混合": "Chinese-English Mixed",
     "主参考音频(请上传3~10秒内参考音频,超过会报错!)": "Primary Reference Audio (Please upload reference audio within 3-10 seconds, exceeding this limit will cause an error!)",
     "主参考音频的文本": "Text of Primary Reference Audio",
@@ -92,7 +87,6 @@
     "句间停顿秒数": "Pause Duration between Sentences (Seconds)",
     "可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。": "Optional: Upload multiple reference audio files by dragging and dropping them (recommended to be of the same gender), and average their tone. If this option is left blank, the tone will be controlled by the single reference audio on the left. If fine-tuning the model, it is recommended that all reference audio files have tones within the fine-tuning training set; the pretrained model can be ignored.",
     "合成语音": "Start inference",
-    "合成音频": "Synthesize Audio",
     "合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "An example of a valid folder path format: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例 (simply copy the address from the file manager's address bar).",
     "后续将支持转音素、手工修改音素、语音合成分步执行。": "Support for Phoneme Conversion, Manual Phoneme Editing, and Step-by-Step Speech Synthesis will be added in the future.",
     "听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。": "If reference audio is not clear or unsure what to write, enable this option to ignore the reference text.",
@@ -110,15 +104,11 @@
     "已关闭": " is Closed",
     "已完成": " Finished",
     "已开启": " is Opened",
-    "并行合成中": "Parallel Synthesis in Progress",
     "并行推理": "Parallel Inference",
     "并行推理模式已关闭": "Parallel Inference Mode Disabled",
     "并行推理模式已开启": "Parallel Inference Mode Enabled",
-    "底模缺失,无法加载相应 LoRA 权重": "Missing Pretrained Model, Cannot Load LoRA Weights",
     "开启": "Open ",
     "开启无参考文本模式。不填参考文本亦相当于开启。": "Enable no reference mode. 
If you don't fill 'Text for reference audio', no reference mode will be enabled.",
-    "当开启并行推理模式时,SoVits V3/4模型不支持分桶处理,已自动关闭分桶处理": "When parallel inference mode is enabled, SoVITS V3/4 models do not support bucket processing; bucket processing has been automatically disabled.",
-    "微调模型信息": "Fine-tuned Model Information",
     "微调训练": "Fine-Tuning",
     "怎么切": "How to slice the sentence",
     "总训练轮数total_epoch": "Total training epochs (total_epoch):",
@@ -150,8 +140,8 @@
     "模型": "Model",
     "模型分为三类:": "Models are categorized into three types:",
     "模型切换": "Model switch",
-    "模型加载中,请等待": "Model is loading, please wait...",
     "每张显卡的batch_size": "Batch size per GPU:",
+    "版本": "Version",
     "粤英混合": "Yue-English Mixed",
     "粤语": "Yue",
     "终止合成": "Terminate Synthesis",
@@ -160,7 +150,6 @@
     "缺少音素数据集": "Missing Phoneme Dataset",
     "缺少音频数据集": "Missing Audio Dataset",
     "英文": "English",
-    "训练模型的版本": "Version of the trained model",
     "训练集格式化一键三连": "Training Set One-Click Formatting",
     "训练集格式化工具": "Dataset Formatting Tool",
     "语义Token提取": "Semantics Token Extraction",
@@ -174,9 +163,10 @@
     "语音识别": "Speech Recognition",
     "语音识别工具": "Speech Recognition Tool",
     "语音降噪": "Speech Denoising",
+    "语音降噪工具": "Speech Denoising Tool",
     "请上传3~10秒内参考音频,超过会报错!": "Please upload a reference audio within the 3-10 second range; if it exceeds this duration, it will raise errors.",
     "请上传参考音频": "Please Upload the Reference Audio",
     "请填入推理文本": "Please Fill in the Target Text",
     "请填入正确的List路径": "Please Fill in the Correct List Path",
     "请填入正确的音频文件夹路径": "Please Fill in the Correct Audio Folder Path",
     "请输入有效文本": "Please enter valid text.",
@@ -197,8 +187,7 @@
     "进度": "Progress",
     "进程已终止": " Process Terminated",
     "进程输出信息": " Process Output Information",
-    "选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的几个是底模,体验5秒Zero Shot TTS不训练推理用。": "Select the model from SoVITS_weights and GPT_weights. The default models are pretrained models for experiencing 5-second Zero-Shot TTS without training.",
-    "采样步数(仅对V3/4生效)": "Sampling Steps (V3/V4 Only)",
+    "选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模,体验5秒Zero Shot TTS用。": "Choose the models from SoVITS_weights and GPT_weights. 
The default one is a pretrain, so you can experience zero shot TTS.", "采样步数,如果觉得电,提高试试,如果觉得慢,降低试试": "Sampling Steps: If feel noisy, try increasing, if feel slow, try decreasing", "重复惩罚": "Repetition Penalty", "随机种子": "Random Seed", @@ -214,13 +203,29 @@ "音频标注WebUI": "Audio Labeling WebUI", "音频自动切分输入路径,可文件可文件夹": "Audio slicer input (file or folder)", "音频超分中": "Running Audio Super-Resolution", - "音频超采样": "Audio Upsampling", - "音频超采样(仅对V3生效))": "Audio Upsampling (V3 Only)", - "预测语义Token": "Predict Semantic Token", "预训练GPT模型路径": "Pretrained GPT Model Path", "预训练SSL模型路径": "Pretrained SSL Model Path", "预训练SoVITS-D模型路径": "Pretrained SoVITS-D Model Path", "预训练SoVITS-G模型路径": "Pretrained SoVITS-G Model Path", "预训练中文BERT模型路径": "Pretrained Chinese BERT Model Path", - "预训练模型路径": "Pretrained Model Path" -} + "参数预设": "Preset", + "选择预设可快速配置推理参数": "Select a preset to quickly configure inference parameters", + "快速合成": "Fast Synthesis", + "高质量": "High Quality", + "平衡": "Balanced", + "自定义": "Custom", + "请输入包含音频文件的文件夹路径": "Please enter the folder path containing audio files", + "请输入 .list 标注文件的完整路径": "Please enter the full path to the .list annotation file", + "音频超采样(仅对V3生效))": "Audio Super-Sampling (V3 Only)", + "采样步数(仅对V3/4生效)": "Sampling Steps (V3/V4 Only)", + "选择文件/文件夹": "Select File/Folder", + "选择输出目录(选择其中任意文件)": "Select Output Directory (Select any file inside)", + "选择输入目录": "Select Input Directory", + "选择输出目录": "Select Output Directory", + "选择标注文件": "Select Annotation File", + "选择音频目录": "Select Audio Directory", + "选择文件夹": "Select Folder", + "选择文件": "Select File", + "📁 选择文件夹": "📁 Select Folder", + "📄 选择文件": "📄 Select File" +} \ No newline at end of file diff --git a/webui.py b/webui.py index 9c5ddd077..85d757665 100644 --- a/webui.py +++ b/webui.py @@ -1,7 +1,10 @@ import os import sys -os.environ["version"] = version = "v2Pro" +if len(sys.argv) == 1: + sys.argv.append("v2") +version = "v1" if sys.argv[1] == "v1" else "v2" +os.environ["version"] = version now_dir = os.getcwd() sys.path.insert(0, now_dir) import warnings @@ -9,6 +12,7 @@ warnings.filterwarnings("ignore") import json import platform +import re import shutil import signal @@ -60,23 +64,11 @@ import subprocess from subprocess import Popen -from tools.assets import css, js, top_html -from tools.i18n.i18n import I18nAuto, scan_language_list - -language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else "Auto" -os.environ["language"] = language -i18n = I18nAuto(language=language) -from multiprocessing import cpu_count - from config import ( - GPU_INDEX, - GPU_INFOS, - IS_GPU, exp_root, infer_device, is_half, is_share, - memset, python_exec, webui_port_infer_tts, webui_port_main, @@ -84,19 +76,76 @@ webui_port_uvr5, ) from tools import my_utils -from tools.my_utils import check_details, check_for_existance +from tools.i18n.i18n import I18nAuto, scan_language_list + +language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else "Auto" +os.environ["language"] = language +i18n = I18nAuto(language=language) +from multiprocessing import cpu_count -os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE" +from tools.my_utils import check_details, check_for_existance # os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1' # 当遇到mps不支持的步骤时使用cpu +try: + import gradio.analytics as analytics + + analytics.version_check = lambda: None +except: + ... 
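The additions below reintroduce local GPU detection in `webui.py`: every visible CUDA device name is upper-cased and substring-matched against `ok_gpu_keywords`, and only matching cards are treated as usable for training and accelerated inference. A minimal sketch of that check, assuming only `torch` and a subset of the keyword set from the hunk:

```python
import torch

# Subset of ok_gpu_keywords from the hunk below; matching is a deliberately
# loose substring test against the upper-cased device name.
OK_GPU_KEYWORDS = {"10", "16", "20", "30", "40", "T4", "L4", "TITAN", "H"}


def usable_cuda_devices():
    """Return (index, name) for every CUDA device whose name matches a keyword."""
    found = []
    for i in range(torch.cuda.device_count()):
        name = torch.cuda.get_device_name(i)
        if any(keyword in name.upper() for keyword in OK_GPU_KEYWORDS):
            found.append((i, name))
    return found
```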
import gradio as gr n_cpu = cpu_count() -set_gpu_numbers = GPU_INDEX -gpu_infos = GPU_INFOS -mem = memset -is_gpu_ok = IS_GPU +ngpu = torch.cuda.device_count() +gpu_infos = [] +mem = [] +if_gpu_ok = False + +# 判断是否有能用来训练和加速推理的N卡 +ok_gpu_keywords = { + "10", + "16", + "20", + "30", + "40", + "A2", + "A3", + "A4", + "P4", + "A50", + "500", + "A60", + "70", + "80", + "90", + "M4", + "T4", + "TITAN", + "L4", + "4060", + "H", + "600", + "506", + "507", + "508", + "509", +} +set_gpu_numbers = set() +if torch.cuda.is_available() or ngpu != 0: + for i in range(ngpu): + gpu_name = torch.cuda.get_device_name(i) + if any(value in gpu_name.upper() for value in ok_gpu_keywords): + # A10#A100#V100#A40#P40#M40#K80#A4500 + if_gpu_ok = True # 至少有一张能用的N卡 + gpu_infos.append("%s\t%s" % (i, gpu_name)) + set_gpu_numbers.add(i) + mem.append(int(torch.cuda.get_device_properties(i).total_memory / 1024 / 1024 / 1024 + 0.4)) +# # 判断是否支持mps加速 +# if torch.backends.mps.is_available(): +# if_gpu_ok = True +# gpu_infos.append("%s\t%s" % ("0", "Apple GPU")) +# mem.append(psutil.virtual_memory().total/ 1024 / 1024 / 1024) # 实测使用系统内存作为显存不会爆显存 + v3v4set = {"v3", "v4"} @@ -113,12 +162,34 @@ def set_default(): default_batch_size_s1, \ if_force_ckpt if_force_ckpt = False - gpu_info = "\n".join(gpu_infos) - if is_gpu_ok: + if if_gpu_ok and len(gpu_infos) > 0: + gpu_info = "\n".join(gpu_infos) minmem = min(mem) - default_batch_size = int(minmem // 2 if version not in v3v4set else minmem // 8) - default_batch_size_s1 = int(minmem // 2) + # if version == "v3" and minmem < 14: + # # API读取不到共享显存,直接填充确认 + # try: + # torch.zeros((1024,1024,1024,14),dtype=torch.int8,device="cuda") + # torch.cuda.empty_cache() + # minmem = 14 + # except RuntimeError as _: + # # 强制梯度检查只需要12G显存 + # if minmem >= 12 : + # if_force_ckpt = True + # minmem = 14 + # else: + # try: + # torch.zeros((1024,1024,1024,12),dtype=torch.int8,device="cuda") + # torch.cuda.empty_cache() + # if_force_ckpt = True + # minmem = 14 + # except RuntimeError as _: + # print("显存不足以开启V3训练") + default_batch_size = minmem // 2 if version not in v3v4set else minmem // 8 + default_batch_size_s1 = minmem // 2 else: + gpu_info = "%s\t%s" % ("0", "CPU") + gpu_infos.append("%s\t%s" % ("0", "CPU")) + set_gpu_numbers.add(0) default_batch_size = default_batch_size_s1 = int(psutil.virtual_memory().total / 1024 / 1024 / 1024 / 4) if version not in v3v4set: default_sovits_epoch = 8 @@ -138,8 +209,8 @@ def set_default(): set_default() -gpus = "-".join(map(str, GPU_INDEX)) -default_gpu_numbers = infer_device.index +gpus = "-".join([i[0] for i in gpu_infos]) +default_gpu_numbers = str(sorted(list(set_gpu_numbers))[0]) def fix_gpu_number(input): # 将越界的number强制改到界内 @@ -161,45 +232,86 @@ def fix_gpu_numbers(inputs): return inputs -from config import pretrained_gpt_name, pretrained_sovits_name - - -def check_pretrained_is_exist(version): - pretrained_model_list = ( - pretrained_sovits_name[version], - pretrained_sovits_name[version].replace("s2G", "s2D"), - pretrained_gpt_name[version], - "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large", - "GPT_SoVITS/pretrained_models/chinese-hubert-base", - ) - _ = "" - for i in pretrained_model_list: - if "s2Dv3" not in i and "s2Dv4" not in i and os.path.exists(i) == False: - _ += f"\n {i}" - if _: - print("warning: ", i18n("以下模型不存在:") + _) - - -check_pretrained_is_exist(version) -for key in pretrained_sovits_name.keys(): - if os.path.exists(pretrained_sovits_name[key]) == False: - pretrained_sovits_name[key] = "" -for key in pretrained_gpt_name.keys(): - 
if os.path.exists(pretrained_gpt_name[key]) == False: - pretrained_gpt_name[key] = "" - -from config import ( - GPT_weight_root, - GPT_weight_version2root, - SoVITS_weight_root, - SoVITS_weight_version2root, - change_choices, - get_weights_names, +pretrained_sovits_name = [ + "GPT_SoVITS/pretrained_models/s2G488k.pth", + "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth", + "GPT_SoVITS/pretrained_models/s2Gv3.pth", + "GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth", +] +pretrained_gpt_name = [ + "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt", + "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt", + "GPT_SoVITS/pretrained_models/s1v3.ckpt", + "GPT_SoVITS/pretrained_models/s1v3.ckpt", +] + +pretrained_model_list = ( + pretrained_sovits_name[int(version[-1]) - 1], + pretrained_sovits_name[int(version[-1]) - 1].replace("s2G", "s2D"), + pretrained_gpt_name[int(version[-1]) - 1], + "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large", + "GPT_SoVITS/pretrained_models/chinese-hubert-base", ) +_ = "" +for i in pretrained_model_list: + if "s2Dv3" not in i and os.path.exists(i) == False: + _ += f"\n {i}" +if _: + print("warning: ", i18n("以下模型不存在:") + _) + +_ = [[], []] +for i in range(4): + if os.path.exists(pretrained_gpt_name[i]): + _[0].append(pretrained_gpt_name[i]) + else: + _[0].append("") ##没有下pretrained模型的,说不定他们是想自己从零训底模呢 + if os.path.exists(pretrained_sovits_name[i]): + _[-1].append(pretrained_sovits_name[i]) + else: + _[-1].append("") +pretrained_gpt_name, pretrained_sovits_name = _ + +SoVITS_weight_root = ["SoVITS_weights", "SoVITS_weights_v2", "SoVITS_weights_v3", "SoVITS_weights_v4"] +GPT_weight_root = ["GPT_weights", "GPT_weights_v2", "GPT_weights_v3", "GPT_weights_v4"] for root in SoVITS_weight_root + GPT_weight_root: os.makedirs(root, exist_ok=True) + + +def get_weights_names(): + SoVITS_names = [name for name in pretrained_sovits_name if name != ""] + for path in SoVITS_weight_root: + for name in os.listdir(path): + if name.endswith(".pth"): + SoVITS_names.append("%s/%s" % (path, name)) + GPT_names = [name for name in pretrained_gpt_name if name != ""] + for path in GPT_weight_root: + for name in os.listdir(path): + if name.endswith(".ckpt"): + GPT_names.append("%s/%s" % (path, name)) + return SoVITS_names, GPT_names + + SoVITS_names, GPT_names = get_weights_names() +for path in SoVITS_weight_root + GPT_weight_root: + os.makedirs(path, exist_ok=True) + + +def custom_sort_key(s): + # 使用正则表达式提取字符串中的数字部分和非数字部分 + parts = re.split("(\d+)", s) + # 将数字部分转换为整数,非数字部分保持不变 + parts = [int(part) if part.isdigit() else part for part in parts] + return parts + + +def change_choices(): + SoVITS_names, GPT_names = get_weights_names() + return {"choices": sorted(SoVITS_names, key=custom_sort_key), "__type__": "update"}, { + "choices": sorted(GPT_names, key=custom_sort_key), + "__type__": "update", + } + p_label = None p_uvr5 = None @@ -264,6 +376,141 @@ def process_info(process_name="", indicator=""): return process_name +# ================================ +# 文件/文件夹选择辅助函数(使用 tkinter 原生对话框) +# ================================ +import threading + +def open_folder_dialog(current_path=""): + """ + 打开系统原生文件夹选择对话框。 + 在单独的线程中运行 tkinter 以避免阻塞 Gradio。 + """ + result = {"path": current_path} + + def _open_dialog(): + try: + import tkinter as tk + from tkinter import filedialog + + root = tk.Tk() + root.withdraw() # 隐藏主窗口 + root.attributes('-topmost', True) # 置顶 + + # 设置初始目录 + initial_dir = 
current_path if current_path and os.path.isdir(current_path) else now_dir + + folder_path = filedialog.askdirectory( + title=i18n("选择文件夹"), + initialdir=initial_dir + ) + + root.destroy() + + if folder_path: + result["path"] = folder_path + except Exception as e: + print(f"Folder dialog error: {e}") + + # 在主线程运行 tkinter(必须) + _open_dialog() + return result["path"] + + +def open_file_dialog(current_path="", file_types=None): + """ + 打开系统原生文件选择对话框。 + file_types: 文件类型过滤器,例如 [("List files", "*.list"), ("All files", "*.*")] + """ + result = {"path": current_path} + + def _open_dialog(): + try: + import tkinter as tk + from tkinter import filedialog + + root = tk.Tk() + root.withdraw() + root.attributes('-topmost', True) + + # 设置初始目录 + initial_dir = os.path.dirname(current_path) if current_path and os.path.exists(os.path.dirname(current_path)) else now_dir + + # 默认文件类型 + if file_types is None: + filetypes = [("All files", "*.*")] + else: + filetypes = file_types + + file_path = filedialog.askopenfilename( + title=i18n("选择文件"), + initialdir=initial_dir, + filetypes=filetypes + ) + + root.destroy() + + if file_path: + result["path"] = file_path + except Exception as e: + print(f"File dialog error: {e}") + + _open_dialog() + return result["path"] + + +def open_list_file_dialog(current_path=""): + """ + 打开文件选择对话框,专门用于选择 .list 文件。 + """ + return open_file_dialog( + current_path, + file_types=[("List files", "*.list"), ("Text files", "*.txt"), ("All files", "*.*")] + ) + + +def open_audio_file_dialog(current_path=""): + """ + 打开文件选择对话框,专门用于选择音频文件。 + """ + return open_file_dialog( + current_path, + file_types=[ + ("Audio files", "*.wav *.mp3 *.flac *.ogg *.m4a"), + ("WAV files", "*.wav"), + ("MP3 files", "*.mp3"), + ("All files", "*.*") + ] + ) + + +# 保留原有函数用于兼容 File 组件 +def get_file_path(file_obj): + """从 Gradio File 组件获取文件路径。""" + if file_obj is None: + return "" + if isinstance(file_obj, str): + return file_obj + if hasattr(file_obj, 'name'): + return file_obj.name + return "" + + +def get_folder_from_file(file_obj): + """从上传的文件推断其所在文件夹的路径。""" + if file_obj is None: + return "" + file_path = "" + if isinstance(file_obj, str): + file_path = file_obj + elif hasattr(file_obj, 'name'): + file_path = file_obj.name + + if file_path: + return os.path.dirname(file_path) + return "" + + process_name_subfix = i18n("音频标注WebUI") @@ -338,11 +585,11 @@ def change_tts_inference(bert_path, cnhubert_base_path, gpu_number, gpt_path, so # if version=="v3": # cmd = '"%s" GPT_SoVITS/inference_webui.py "%s"'%(python_exec, language) if p_tts_inference is None: - os.environ["gpt_path"] = gpt_path - os.environ["sovits_path"] = sovits_path + os.environ["gpt_path"] = gpt_path if "/" in gpt_path else "%s/%s" % (GPT_weight_root, gpt_path) + os.environ["sovits_path"] = sovits_path if "/" in sovits_path else "%s/%s" % (SoVITS_weight_root, sovits_path) os.environ["cnhubert_base_path"] = cnhubert_base_path os.environ["bert_path"] = bert_path - os.environ["_CUDA_VISIBLE_DEVICES"] = str(fix_gpu_number(gpu_number)) + os.environ["_CUDA_VISIBLE_DEVICES"] = fix_gpu_number(gpu_number) os.environ["is_half"] = str(is_half) os.environ["infer_ttswebui"] = str(webui_port_infer_tts) os.environ["is_share"] = str(is_share) @@ -487,7 +734,6 @@ def close_denoise(): def open1Ba( - version, batch_size, total_epoch, exp_name, @@ -503,13 +749,7 @@ def open1Ba( ): global p_train_SoVITS if p_train_SoVITS == None: - exp_name = exp_name.rstrip(" ") - config_file = ( - "GPT_SoVITS/configs/s2.json" - if version not in {"v2Pro", "v2ProPlus"} - else 
f"GPT_SoVITS/configs/s2{version}.json" - ) - with open(config_file) as f: + with open("GPT_SoVITS/configs/s2.json") as f: data = f.read() data = json.loads(data) s2_dir = "%s/%s" % (exp_root, exp_name) @@ -532,13 +772,13 @@ def open1Ba( data["train"]["lora_rank"] = lora_rank data["model"]["version"] = version data["data"]["exp_dir"] = data["s2_ckpt_dir"] = s2_dir - data["save_weight_dir"] = SoVITS_weight_version2root[version] + data["save_weight_dir"] = SoVITS_weight_root[int(version[-1]) - 1] data["name"] = exp_name data["version"] = version tmp_config_path = "%s/tmp_s2.json" % tmp with open(tmp_config_path, "w") as f: f.write(json.dumps(data)) - if version in ["v1", "v2", "v2Pro", "v2ProPlus"]: + if version in ["v1", "v2"]: cmd = '"%s" -s GPT_SoVITS/s2_train.py --config "%s"' % (python_exec, tmp_config_path) else: cmd = '"%s" -s GPT_SoVITS/s2_train_v3_lora.py --config "%s"' % (python_exec, tmp_config_path) @@ -600,7 +840,6 @@ def open1Bb( ): global p_train_GPT if p_train_GPT == None: - exp_name = exp_name.rstrip(" ") with open( "GPT_SoVITS/configs/s1longer.yaml" if version == "v1" else "GPT_SoVITS/configs/s1longer-v2.yaml" ) as f: @@ -620,14 +859,14 @@ def open1Bb( data["train"]["if_save_every_weights"] = if_save_every_weights data["train"]["if_save_latest"] = if_save_latest data["train"]["if_dpo"] = if_dpo - data["train"]["half_weights_save_dir"] = GPT_weight_version2root[version] + data["train"]["half_weights_save_dir"] = GPT_weight_root[int(version[-1]) - 1] data["train"]["exp_name"] = exp_name data["train_semantic_path"] = "%s/6-name2semantic.tsv" % s1_dir data["train_phoneme_path"] = "%s/2-name2text.txt" % s1_dir data["output_dir"] = "%s/logs_s1_%s" % (s1_dir, version) # data["version"]=version - os.environ["_CUDA_VISIBLE_DEVICES"] = str(fix_gpu_numbers(gpu_numbers.replace("-", ","))) + os.environ["_CUDA_VISIBLE_DEVICES"] = fix_gpu_numbers(gpu_numbers.replace("-", ",")) os.environ["hz"] = "25hz" tmp_config_path = "%s/tmp_s1.yaml" % tmp with open(tmp_config_path, "w") as f: @@ -783,7 +1022,6 @@ def open1a(inp_text, inp_wav_dir, exp_name, gpu_numbers, bert_pretrained_dir): inp_wav_dir = my_utils.clean_path(inp_wav_dir) if check_for_existance([inp_text, inp_wav_dir], is_dataset_processing=True): check_details([inp_text, inp_wav_dir], is_dataset_processing=True) - exp_name = exp_name.rstrip(" ") if ps1a == []: opt_dir = "%s/%s" % (exp_root, exp_name) config = { @@ -800,7 +1038,7 @@ def open1a(inp_text, inp_wav_dir, exp_name, gpu_numbers, bert_pretrained_dir): { "i_part": str(i_part), "all_parts": str(all_parts), - "_CUDA_VISIBLE_DEVICES": str(fix_gpu_number(gpu_names[i_part])), + "_CUDA_VISIBLE_DEVICES": fix_gpu_number(gpu_names[i_part]), "is_half": str(is_half), } ) @@ -862,18 +1100,16 @@ def close1a(): ) -sv_path = "GPT_SoVITS/pretrained_models/sv/pretrained_eres2netv2w24s4ep4.ckpt" ps1b = [] process_name_1b = i18n("语音自监督特征提取") -def open1b(version, inp_text, inp_wav_dir, exp_name, gpu_numbers, ssl_pretrained_dir): +def open1b(inp_text, inp_wav_dir, exp_name, gpu_numbers, ssl_pretrained_dir): global ps1b inp_text = my_utils.clean_path(inp_text) inp_wav_dir = my_utils.clean_path(inp_wav_dir) if check_for_existance([inp_text, inp_wav_dir], is_dataset_processing=True): check_details([inp_text, inp_wav_dir], is_dataset_processing=True) - exp_name = exp_name.rstrip(" ") if ps1b == []: config = { "inp_text": inp_text, @@ -881,7 +1117,6 @@ def open1b(version, inp_text, inp_wav_dir, exp_name, gpu_numbers, ssl_pretrained "exp_name": exp_name, "opt_dir": "%s/%s" % (exp_root, exp_name), 
"cnhubert_base_dir": ssl_pretrained_dir, - "sv_path": sv_path, "is_half": str(is_half), } gpu_names = gpu_numbers.split("-") @@ -891,7 +1126,7 @@ def open1b(version, inp_text, inp_wav_dir, exp_name, gpu_numbers, ssl_pretrained { "i_part": str(i_part), "all_parts": str(all_parts), - "_CUDA_VISIBLE_DEVICES": str(fix_gpu_number(gpu_names[i_part])), + "_CUDA_VISIBLE_DEVICES": fix_gpu_number(gpu_names[i_part]), } ) os.environ.update(config) @@ -907,23 +1142,6 @@ def open1b(version, inp_text, inp_wav_dir, exp_name, gpu_numbers, ssl_pretrained for p in ps1b: p.wait() ps1b = [] - if "Pro" in version: - for i_part in range(all_parts): - config.update( - { - "i_part": str(i_part), - "all_parts": str(all_parts), - "_CUDA_VISIBLE_DEVICES": str(fix_gpu_number(gpu_names[i_part])), - } - ) - os.environ.update(config) - cmd = '"%s" -s GPT_SoVITS/prepare_datasets/2-get-sv.py' % python_exec - print(cmd) - p = Popen(cmd, shell=True) - ps1b.append(p) - for p in ps1b: - p.wait() - ps1b = [] yield ( process_info(process_name_1b, "finish"), {"__type__": "update", "visible": True}, @@ -957,25 +1175,19 @@ def close1b(): process_name_1c = i18n("语义Token提取") -def open1c(version, inp_text, inp_wav_dir, exp_name, gpu_numbers, pretrained_s2G_path): +def open1c(inp_text, exp_name, gpu_numbers, pretrained_s2G_path): global ps1c inp_text = my_utils.clean_path(inp_text) - if check_for_existance([inp_text, inp_wav_dir], is_dataset_processing=True): - check_details([inp_text, inp_wav_dir], is_dataset_processing=True) - exp_name = exp_name.rstrip(" ") + if check_for_existance([inp_text, ""], is_dataset_processing=True): + check_details([inp_text, ""], is_dataset_processing=True) if ps1c == []: opt_dir = "%s/%s" % (exp_root, exp_name) - config_file = ( - "GPT_SoVITS/configs/s2.json" - if version not in {"v2Pro", "v2ProPlus"} - else f"GPT_SoVITS/configs/s2{version}.json" - ) config = { "inp_text": inp_text, "exp_name": exp_name, "opt_dir": opt_dir, "pretrained_s2G": pretrained_s2G_path, - "s2config_path": config_file, + "s2config_path": "GPT_SoVITS/configs/s2.json", "is_half": str(is_half), } gpu_names = gpu_numbers.split("-") @@ -985,7 +1197,7 @@ def open1c(version, inp_text, inp_wav_dir, exp_name, gpu_numbers, pretrained_s2G { "i_part": str(i_part), "all_parts": str(all_parts), - "_CUDA_VISIBLE_DEVICES": str(fix_gpu_number(gpu_names[i_part])), + "_CUDA_VISIBLE_DEVICES": fix_gpu_number(gpu_names[i_part]), } ) os.environ.update(config) @@ -1044,7 +1256,6 @@ def close1c(): def open1abc( - version, inp_text, inp_wav_dir, exp_name, @@ -1060,7 +1271,6 @@ def open1abc( inp_wav_dir = my_utils.clean_path(inp_wav_dir) if check_for_existance([inp_text, inp_wav_dir], is_dataset_processing=True): check_details([inp_text, inp_wav_dir], is_dataset_processing=True) - exp_name = exp_name.rstrip(" ") if ps1abc == []: opt_dir = "%s/%s" % (exp_root, exp_name) try: @@ -1085,7 +1295,7 @@ def open1abc( { "i_part": str(i_part), "all_parts": str(all_parts), - "_CUDA_VISIBLE_DEVICES": str(fix_gpu_number(gpu_names[i_part])), + "_CUDA_VISIBLE_DEVICES": fix_gpu_number(gpu_names[i_part]), } ) os.environ.update(config) @@ -1123,7 +1333,6 @@ def open1abc( "exp_name": exp_name, "opt_dir": opt_dir, "cnhubert_base_dir": ssl_pretrained_dir, - "sv_path": sv_path, } gpu_names = gpu_numbers1Ba.split("-") all_parts = len(gpu_names) @@ -1132,7 +1341,7 @@ def open1abc( { "i_part": str(i_part), "all_parts": str(all_parts), - "_CUDA_VISIBLE_DEVICES": str(fix_gpu_number(gpu_names[i_part])), + "_CUDA_VISIBLE_DEVICES": fix_gpu_number(gpu_names[i_part]), } ) 
os.environ.update(config)
@@ -1147,45 +1356,23 @@ def open1abc(
                 )
                 for p in ps1abc:
                     p.wait()
-            ps1abc = []
-            if "Pro" in version:
-                for i_part in range(all_parts):
-                    config.update(
-                        {
-                            "i_part": str(i_part),
-                            "all_parts": str(all_parts),
-                            "_CUDA_VISIBLE_DEVICES": str(fix_gpu_number(gpu_names[i_part])),
-                        }
-                    )
-                    os.environ.update(config)
-                    cmd = '"%s" -s GPT_SoVITS/prepare_datasets/2-get-sv.py' % python_exec
-                    print(cmd)
-                    p = Popen(cmd, shell=True)
-                    ps1abc.append(p)
-                for p in ps1abc:
-                    p.wait()
-                ps1abc = []
             yield (
                 i18n("进度") + ": 1A-Done, 1B-Done",
                 {"__type__": "update", "visible": False},
                 {"__type__": "update", "visible": True},
             )
+            ps1abc = []
             #############################1c
             path_semantic = "%s/6-name2semantic.tsv" % opt_dir
             if os.path.exists(path_semantic) == False or (
                 os.path.exists(path_semantic) == True and os.path.getsize(path_semantic) < 31
             ):
-                config_file = (
-                    "GPT_SoVITS/configs/s2.json"
-                    if version not in {"v2Pro", "v2ProPlus"}
-                    else f"GPT_SoVITS/configs/s2{version}.json"
-                )
                 config = {
                     "inp_text": inp_text,
                     "exp_name": exp_name,
                     "opt_dir": opt_dir,
                     "pretrained_s2G": pretrained_s2G_path,
-                    "s2config_path": config_file,
+                    "s2config_path": "GPT_SoVITS/configs/s2.json",
                 }
                 gpu_names = gpu_numbers1c.split("-")
                 all_parts = len(gpu_names)
@@ -1194,7 +1381,7 @@ def open1abc(
                     {
                         "i_part": str(i_part),
                         "all_parts": str(all_parts),
-                        "_CUDA_VISIBLE_DEVICES": str(fix_gpu_number(gpu_names[i_part])),
+                        "_CUDA_VISIBLE_DEVICES": fix_gpu_number(gpu_names[i_part]),
                     }
                 )
                 os.environ.update(config)
@@ -1265,17 +1452,17 @@ def switch_version(version_):
     os.environ["version"] = version_
     global version
     version = version_
-    if pretrained_sovits_name[version] != "" and pretrained_gpt_name[version] != "":
+    if pretrained_sovits_name[int(version[-1]) - 1] != "" and pretrained_gpt_name[int(version[-1]) - 1] != "":
         ...
     else:
         gr.Warning(i18n("未下载模型") + ": " + version.upper())
         set_default()
     return (
-        {"__type__": "update", "value": pretrained_sovits_name[version]},
-        {"__type__": "update", "value": pretrained_sovits_name[version].replace("s2G", "s2D")},
-        {"__type__": "update", "value": pretrained_gpt_name[version]},
-        {"__type__": "update", "value": pretrained_gpt_name[version]},
-        {"__type__": "update", "value": pretrained_sovits_name[version]},
+        {"__type__": "update", "value": pretrained_sovits_name[int(version[-1]) - 1]},
+        {"__type__": "update", "value": pretrained_sovits_name[int(version[-1]) - 1].replace("s2G", "s2D")},
+        {"__type__": "update", "value": pretrained_gpt_name[int(version[-1]) - 1]},
+        {"__type__": "update", "value": pretrained_gpt_name[int(version[-1]) - 1]},
+        {"__type__": "update", "value": pretrained_sovits_name[int(version[-1]) - 1]},
         {"__type__": "update", "value": default_batch_size, "maximum": default_max_batch_size},
         {"__type__": "update", "value": default_sovits_epoch, "maximum": max_sovits_epoch},
         {"__type__": "update", "value": default_sovits_save_every_epoch, "maximum": max_sovits_save_every_epoch},
@@ -1302,91 +1489,89 @@ def sync(text):
     return {"__type__": "update", "value": text}


-with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css) as app:
-    gr.HTML(
-        top_html.format(
-            i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.")
-            + i18n("如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.")
-        ),
-        elem_classes="markdown",
+with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False) as app:
+    gr.Markdown(
+        value=i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.")
+        + "<br>"
+        + i18n("如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.")
     )
+    gr.Markdown(value=i18n("中文教程文档") + ": " + "https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e")
     with gr.Tabs():
         with gr.TabItem("0-" + i18n("前置数据集获取工具")):  # 提前随机切片防止uvr5爆内存->uvr5->slicer->asr->打标
-            with gr.Accordion(label="0a-" + i18n("UVR5人声伴奏分离&去混响去延迟工具")):
-                with gr.Row():
-                    with gr.Column(scale=3):
-                        with gr.Row():
-                            uvr5_info = gr.Textbox(label=process_info(process_name_uvr5, "info"))
-                            open_uvr5 = gr.Button(
-                                value=process_info(process_name_uvr5, "open"), variant="primary", visible=True
-                            )
-                            close_uvr5 = gr.Button(
-                                value=process_info(process_name_uvr5, "close"), variant="primary", visible=False
-                            )
+            gr.Markdown(value="0a-" + i18n("UVR5人声伴奏分离&去混响去延迟工具"))
+            with gr.Row():
+                with gr.Column(scale=3):
+                    with gr.Row():
+                        uvr5_info = gr.Textbox(label=process_info(process_name_uvr5, "info"))
+                        open_uvr5 = gr.Button(value=process_info(process_name_uvr5, "open"), variant="primary", visible=True)
+                        close_uvr5 = gr.Button(value=process_info(process_name_uvr5, "close"), variant="primary", visible=False)

-            with gr.Accordion(label="0b-" + i18n("语音切分工具")):
-                with gr.Row():
-                    with gr.Column(scale=3):
-                        with gr.Row():
-                            slice_inp_path = gr.Textbox(label=i18n("音频自动切分输入路径,可文件可文件夹"), value="")
-                            slice_opt_root = gr.Textbox(
-                                label=i18n("切分后的子音频的输出根目录"), value="output/slicer_opt"
-                            )
-                        with gr.Row():
-                            threshold = gr.Textbox(
-                                label=i18n("threshold:音量小于这个值视作静音的备选切割点"), value="-34"
-                            )
-                            min_length = gr.Textbox(
-                                label=i18n("min_length:每段最小多长,如果第一段太短一直和后面段连起来直到超过这个值"),
-                                value="4000",
-                            )
-                            min_interval = gr.Textbox(label=i18n("min_interval:最短切割间隔"), value="300")
-                            hop_size = gr.Textbox(
-                                label=i18n("hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)"),
-                                value="10",
-                            )
-                            max_sil_kept = gr.Textbox(label=i18n("max_sil_kept:切完后静音最多留多长"), value="500")
-                        with gr.Row():
-                            _max = gr.Slider(
-                                minimum=0,
-                                maximum=1,
-                                step=0.05,
-                                label=i18n("max:归一化后最大值多少"),
-                                value=0.9,
-                                interactive=True,
-                            )
-                            alpha = gr.Slider(
-                                minimum=0,
-                                maximum=1,
-                                step=0.05,
-                                label=i18n("alpha_mix:混多少比例归一化后音频进来"),
-                                value=0.25,
-                                interactive=True,
-                            )
-                        with gr.Row():
-                            n_process = gr.Slider(
-                                minimum=1,
-                                maximum=n_cpu,
-                                step=1,
-                                label=i18n("切割使用的进程数"),
-                                value=4,
-                                interactive=True,
-                            )
-                            slicer_info = gr.Textbox(label=process_info(process_name_slice, "info"))
-                            open_slicer_button = gr.Button(
-                                value=process_info(process_name_slice, "open"), variant="primary", visible=True
-                            )
-                            close_slicer_button = gr.Button(
-                                value=process_info(process_name_slice, "close"), variant="primary", visible=False
-                            )
+            gr.Markdown(value="0b-" + i18n("语音切分工具"))
+            with gr.Row():
+                with gr.Column(scale=3):
+                    with gr.Row():
+                        slice_inp_path = gr.Textbox(label=i18n("音频自动切分输入路径,可文件可文件夹"), value="", scale=6)
+                        slice_inp_btn = gr.Button(i18n("📁 选择文件夹"), scale=1)
+                    with gr.Row():
+                        slice_opt_root = gr.Textbox(label=i18n("切分后的子音频的输出根目录"), value="output/slicer_opt", scale=6)
+                        slice_opt_btn = gr.Button(i18n("📁 选择文件夹"), scale=1)
+                    # 按钮点击事件绑定 - 使用原生对话框
+                    slice_inp_btn.click(open_folder_dialog, [slice_inp_path], [slice_inp_path])
+                    slice_opt_btn.click(open_folder_dialog, [slice_opt_root], [slice_opt_root])
+                    with gr.Row():
+                        threshold = gr.Textbox(label=i18n("threshold:音量小于这个值视作静音的备选切割点"), value="-34")
+                        min_length = gr.Textbox(
+                            label=i18n("min_length:每段最小多长,如果第一段太短一直和后面段连起来直到超过这个值"),
+                            value="4000",
+                        )
+                        min_interval = gr.Textbox(label=i18n("min_interval:最短切割间隔"), value="300")
+                        hop_size = gr.Textbox(
+                            label=i18n("hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)"),
+                            value="10",
+                        )
+                        
max_sil_kept = gr.Textbox(label=i18n("max_sil_kept:切完后静音最多留多长"), value="500")
+                    with gr.Row():
+                        _max = gr.Slider(
+                            minimum=0,
+                            maximum=1,
+                            step=0.05,
+                            label=i18n("max:归一化后最大值多少"),
+                            value=0.9,
+                            interactive=True,
+                        )
+                        alpha = gr.Slider(
+                            minimum=0,
+                            maximum=1,
+                            step=0.05,
+                            label=i18n("alpha_mix:混多少比例归一化后音频进来"),
+                            value=0.25,
+                            interactive=True,
+                        )
+                    with gr.Row():
+                        n_process = gr.Slider(
+                            minimum=1, maximum=n_cpu, step=1, label=i18n("切割使用的进程数"), value=4, interactive=True
+                        )
+                        slicer_info = gr.Textbox(label=process_info(process_name_slice, "info"))
+                        open_slicer_button = gr.Button(
+                            value=process_info(process_name_slice, "open"), variant="primary", visible=True
+                        )
+                        close_slicer_button = gr.Button(
+                            value=process_info(process_name_slice, "close"), variant="primary", visible=False
+                        )

-            # gr.Markdown(value="0bb-" + i18n("语音降噪工具")+i18n("(不稳定,先别用,可能劣化模型效果!)"))
-            with gr.Row(visible=False):
+            gr.Markdown(value="0bb-" + i18n("语音降噪工具") + i18n("(不稳定,先别用,可能劣化模型效果!)"))
+            with gr.Row():
                 with gr.Column(scale=3):
                     with gr.Row():
-                        denoise_input_dir = gr.Textbox(label=i18n("输入文件夹路径"), value="")
-                        denoise_output_dir = gr.Textbox(label=i18n("输出文件夹路径"), value="output/denoise_opt")
+                        denoise_input_dir = gr.Textbox(label=i18n("输入文件夹路径"), value="", scale=6)
+                        denoise_input_btn = gr.Button(i18n("📁 选择文件夹"), scale=1)
+                    with gr.Row():
+                        denoise_output_dir = gr.Textbox(label=i18n("输出文件夹路径"), value="output/denoise_opt", scale=6)
+                        denoise_output_btn = gr.Button(i18n("📁 选择文件夹"), scale=1)
+                    # 按钮点击事件绑定
+                    denoise_input_btn.click(open_folder_dialog, [denoise_input_dir], [denoise_input_dir])
+                    denoise_output_btn.click(open_folder_dialog, [denoise_output_dir], [denoise_output_dir])
                     with gr.Row():
                         denoise_info = gr.Textbox(label=process_info(process_name_denoise, "info"))
                         open_denoise_button = gr.Button(
@@ -1396,40 +1581,46 @@ def sync(text):
                             value=process_info(process_name_denoise, "close"), variant="primary", visible=False
                         )

-            with gr.Accordion(label="0c-" + i18n("语音识别工具")):
-                with gr.Row():
-                    with gr.Column(scale=3):
-                        with gr.Row():
-                            asr_inp_dir = gr.Textbox(
-                                label=i18n("输入文件夹路径"), value="D:\\GPT-SoVITS\\raw\\xxx", interactive=True
-                            )
-                            asr_opt_dir = gr.Textbox(
-                                label=i18n("输出文件夹路径"), value="output/asr_opt", interactive=True
-                            )
-                        with gr.Row():
-                            asr_model = gr.Dropdown(
-                                label=i18n("ASR 模型"),
-                                choices=list(asr_dict.keys()),
-                                interactive=True,
-                                value="达摩 ASR (中文)",
-                            )
-                            asr_size = gr.Dropdown(
-                                label=i18n("ASR 模型尺寸"), choices=["large"], interactive=True, value="large"
-                            )
-                            asr_lang = gr.Dropdown(
-                                label=i18n("ASR 语言设置"), choices=["zh", "yue"], interactive=True, value="zh"
-                            )
-                            asr_precision = gr.Dropdown(
-                                label=i18n("数据类型精度"), choices=["float32"], interactive=True, value="float32"
-                            )
-                        with gr.Row():
-                            asr_info = gr.Textbox(label=process_info(process_name_asr, "info"))
-                            open_asr_button = gr.Button(
-                                value=process_info(process_name_asr, "open"), variant="primary", visible=True
-                            )
-                            close_asr_button = gr.Button(
-                                value=process_info(process_name_asr, "close"), variant="primary", visible=False
-                            )
+            gr.Markdown(value="0c-" + i18n("语音识别工具"))
+            with gr.Row():
+                with gr.Column(scale=3):
+                    with gr.Row():
+                        asr_inp_dir = gr.Textbox(
+                            label=i18n("输入文件夹路径"), value="", interactive=True,
+                            placeholder=i18n("请输入包含音频文件的文件夹路径"),
+                            scale=6
+                        )
+                        asr_inp_btn = gr.Button(i18n("📁 选择文件夹"), scale=1)
+                    with gr.Row():
+                        asr_opt_dir = gr.Textbox(label=i18n("输出文件夹路径"), value="output/asr_opt", interactive=True, scale=6)
+                        asr_opt_btn = gr.Button(i18n("📁 选择文件夹"), scale=1)
+                    # 按钮点击事件绑定
+                    
asr_inp_btn.click(open_folder_dialog, [asr_inp_dir], [asr_inp_dir]) + asr_opt_btn.click(open_folder_dialog, [asr_opt_dir], [asr_opt_dir]) + with gr.Row(): + asr_model = gr.Dropdown( + label=i18n("ASR 模型"), + choices=list(asr_dict.keys()), + interactive=True, + value="达摩 ASR (中文)", + ) + asr_size = gr.Dropdown( + label=i18n("ASR 模型尺寸"), choices=["large"], interactive=True, value="large" + ) + asr_lang = gr.Dropdown( + label=i18n("ASR 语言设置"), choices=["zh", "yue"], interactive=True, value="zh" + ) + asr_precision = gr.Dropdown( + label=i18n("数据类型精度"), choices=["float32"], interactive=True, value="float32" + ) + with gr.Row(): + asr_info = gr.Textbox(label=process_info(process_name_asr, "info")) + open_asr_button = gr.Button( + value=process_info(process_name_asr, "open"), variant="primary", visible=True + ) + close_asr_button = gr.Button( + value=process_info(process_name_asr, "close"), variant="primary", visible=False + ) def change_lang_choices(key): # 根据选择的模型修改可选的语言 return {"__type__": "update", "choices": asr_dict[key]["lang"], "value": asr_dict[key]["lang"][0]} @@ -1453,186 +1644,177 @@ def change_precision_choices(key): # 根据选择的模型修改可选的语言 asr_model.change(change_size_choices, [asr_model], [asr_size]) asr_model.change(change_precision_choices, [asr_model], [asr_precision]) - with gr.Accordion(label="0d-" + i18n("语音文本校对标注工具")): + gr.Markdown(value="0d-" + i18n("语音文本校对标注工具")) + with gr.Row(): + with gr.Column(scale=3): + with gr.Row(): + path_list = gr.Textbox( + label=i18n("标注文件路径 (含文件后缀 *.list)"), + value="", + interactive=True, + placeholder=i18n("请输入 .list 标注文件的完整路径"), + scale=6 + ) + path_list_btn = gr.Button(i18n("📄 选择文件"), scale=1) + # 按钮点击事件绑定 + path_list_btn.click(open_list_file_dialog, [path_list], [path_list]) + with gr.Row(): + label_info = gr.Textbox(label=process_info(process_name_subfix, "info")) + open_label = gr.Button(value=process_info(process_name_subfix, "open"), variant="primary", visible=True) + close_label = gr.Button( + value=process_info(process_name_subfix, "close"), variant="primary", visible=False + ) + + open_label.click(change_label, [path_list], [label_info, open_label, close_label]) + close_label.click(change_label, [path_list], [label_info, open_label, close_label]) + open_uvr5.click(change_uvr5, [], [uvr5_info, open_uvr5, close_uvr5]) + close_uvr5.click(change_uvr5, [], [uvr5_info, open_uvr5, close_uvr5]) + + with gr.TabItem(i18n("1-GPT-SoVITS-TTS")): + with gr.Row(): with gr.Row(): - with gr.Column(scale=3): - with gr.Row(): - path_list = gr.Textbox( - label=i18n("标注文件路径 (含文件后缀 *.list)"), - value="D:\\RVC1006\\GPT-SoVITS\\raw\\xxx.list", - interactive=True, - ) - label_info = gr.Textbox(label=process_info(process_name_subfix, "info")) - open_label = gr.Button( - value=process_info(process_name_subfix, "open"), variant="primary", visible=True + exp_name = gr.Textbox(label=i18n("*实验/模型名"), value="xxx", interactive=True) + gpu_info = gr.Textbox(label=i18n("显卡信息"), value=gpu_info, visible=True, interactive=False) + version_checkbox = gr.Radio(label=i18n("版本"), value=version, choices=["v1", "v2", "v4"]) # , "v3" + with gr.Row(): + pretrained_s2G = gr.Textbox( + label=i18n("预训练SoVITS-G模型路径"), + value=pretrained_sovits_name[int(version[-1]) - 1], + interactive=True, + lines=2, + max_lines=3, + scale=9, ) - close_label = gr.Button( - value=process_info(process_name_subfix, "close"), variant="primary", visible=False + pretrained_s2D = gr.Textbox( + label=i18n("预训练SoVITS-D模型路径"), + value=pretrained_sovits_name[int(version[-1]) - 1].replace("s2G", "s2D"), + 
interactive=True, + lines=2, + max_lines=3, + scale=9, + ) + pretrained_s1 = gr.Textbox( + label=i18n("预训练GPT模型路径"), + value=pretrained_gpt_name[int(version[-1]) - 1], + interactive=True, + lines=2, + max_lines=3, + scale=10, ) - open_label.click(change_label, [path_list], [label_info, open_label, close_label]) - close_label.click(change_label, [path_list], [label_info, open_label, close_label]) - open_uvr5.click(change_uvr5, [], [uvr5_info, open_uvr5, close_uvr5]) - close_uvr5.click(change_uvr5, [], [uvr5_info, open_uvr5, close_uvr5]) + with gr.TabItem("1A-" + i18n("训练集格式化工具")): + gr.Markdown(value=i18n("输出logs/实验名目录下应有23456开头的文件和文件夹")) + with gr.Row(): + with gr.Row(): + inp_text = gr.Textbox( + label=i18n("*文本标注文件"), + value="", + interactive=True, + scale=6, + placeholder=i18n("请输入 .list 标注文件的完整路径") + ) + inp_text_btn = gr.Button(i18n("📄 选择文件"), scale=1) + with gr.Row(): + inp_wav_dir = gr.Textbox( + label=i18n("*训练集音频文件目录"), + # value=r"D:\RVC1006\GPT-SoVITS\raw\xxx", + interactive=True, + placeholder=i18n( + "填切割后音频所在目录!读取的音频文件完整路径=该目录-拼接-list文件里波形对应的文件名(不是全路径)。如果留空则使用.list文件里的绝对全路径。" + ), + scale=6, + ) + inp_wav_dir_btn = gr.Button(i18n("📁 选择文件夹"), scale=1) + # 按钮点击事件绑定 + inp_text_btn.click(open_list_file_dialog, [inp_text], [inp_text]) + inp_wav_dir_btn.click(open_folder_dialog, [inp_wav_dir], [inp_wav_dir]) - with gr.TabItem(i18n("1-GPT-SoVITS-TTS")): - with gr.Accordion(i18n("微调模型信息")): + gr.Markdown(value="1Aa-" + process_name_1a) with gr.Row(): - with gr.Row(equal_height=True): - exp_name = gr.Textbox( - label=i18n("*实验/模型名"), - value="xxx", + with gr.Row(): + gpu_numbers1a = gr.Textbox( + label=i18n("GPU卡号以-分割,每个卡号一个进程"), + value="%s-%s" % (gpus, gpus), interactive=True, - scale=3, ) - gpu_info_box = gr.Textbox( - label=i18n("显卡信息"), - value=gpu_info, - visible=True, + with gr.Row(): + bert_pretrained_dir = gr.Textbox( + label=i18n("预训练中文BERT模型路径"), + value="GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large", interactive=False, - scale=5, + lines=2, ) - version_checkbox = gr.Radio( - label=i18n("训练模型的版本"), - value=version, - choices=["v1", "v2", "v4", "v2Pro", "v2ProPlus"], - scale=5, + with gr.Row(): + button1a_open = gr.Button( + value=process_info(process_name_1a, "open"), variant="primary", visible=True ) - with gr.Accordion(label=i18n("预训练模型路径"), open=False): + button1a_close = gr.Button( + value=process_info(process_name_1a, "close"), variant="primary", visible=False + ) + with gr.Row(): + info1a = gr.Textbox(label=process_info(process_name_1a, "info")) + + gr.Markdown(value="1Ab-" + process_name_1b) with gr.Row(): - with gr.Row(equal_height=True): - pretrained_s1 = gr.Textbox( - label=i18n("预训练GPT模型路径"), - value=pretrained_gpt_name[version], + with gr.Row(): + gpu_numbers1Ba = gr.Textbox( + label=i18n("GPU卡号以-分割,每个卡号一个进程"), + value="%s-%s" % (gpus, gpus), interactive=True, - lines=1, - max_lines=1, - scale=3, ) - pretrained_s2G = gr.Textbox( - label=i18n("预训练SoVITS-G模型路径"), - value=pretrained_sovits_name[version], - interactive=True, - lines=1, - max_lines=1, - scale=5, + with gr.Row(): + cnhubert_base_dir = gr.Textbox( + label=i18n("预训练SSL模型路径"), + value="GPT_SoVITS/pretrained_models/chinese-hubert-base", + interactive=False, + lines=2, ) - pretrained_s2D = gr.Textbox( - label=i18n("预训练SoVITS-D模型路径"), - value=pretrained_sovits_name[version].replace("s2G", "s2D"), - interactive=True, - lines=1, - max_lines=1, - scale=5, + with gr.Row(): + button1b_open = gr.Button( + value=process_info(process_name_1b, "open"), variant="primary", visible=True + ) + 
button1b_close = gr.Button( + value=process_info(process_name_1b, "close"), variant="primary", visible=False ) - - with gr.TabItem("1A-" + i18n("训练集格式化工具")): - with gr.Accordion(label=i18n("输出logs/实验名目录下应有23456开头的文件和文件夹")): with gr.Row(): - with gr.Row(): - inp_text = gr.Textbox( - label=i18n("*文本标注文件"), - value=r"D:\RVC1006\GPT-SoVITS\raw\xxx.list", - interactive=True, - scale=10, - ) - with gr.Row(): - inp_wav_dir = gr.Textbox( - label=i18n("*训练集音频文件目录"), - # value=r"D:\RVC1006\GPT-SoVITS\raw\xxx", - interactive=True, - placeholder=i18n( - "填切割后音频所在目录!读取的音频文件完整路径=该目录-拼接-list文件里波形对应的文件名(不是全路径)。如果留空则使用.list文件里的绝对全路径。" - ), - scale=10, - ) + info1b = gr.Textbox(label=process_info(process_name_1b, "info")) - with gr.Accordion(label="1Aa-" + process_name_1a): + gr.Markdown(value="1Ac-" + process_name_1c) + with gr.Row(): with gr.Row(): - with gr.Row(): - gpu_numbers1a = gr.Textbox( - label=i18n("GPU卡号以-分割,每个卡号一个进程"), - value="%s-%s" % (gpus, gpus), - interactive=True, - ) - with gr.Row(): - bert_pretrained_dir = gr.Textbox( - label=i18n("预训练中文BERT模型路径"), - value="GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large", - interactive=False, - lines=2, - ) - with gr.Row(): - button1a_open = gr.Button( - value=process_info(process_name_1a, "open"), variant="primary", visible=True - ) - button1a_close = gr.Button( - value=process_info(process_name_1a, "close"), variant="primary", visible=False - ) - with gr.Row(): - info1a = gr.Textbox(label=process_info(process_name_1a, "info")) - - with gr.Accordion(label="1Ab-" + process_name_1b): + gpu_numbers1c = gr.Textbox( + label=i18n("GPU卡号以-分割,每个卡号一个进程"), + value="%s-%s" % (gpus, gpus), + interactive=True, + ) with gr.Row(): - with gr.Row(): - gpu_numbers1Ba = gr.Textbox( - label=i18n("GPU卡号以-分割,每个卡号一个进程"), - value="%s-%s" % (gpus, gpus), - interactive=True, - ) - with gr.Row(): - cnhubert_base_dir = gr.Textbox( - label=i18n("预训练SSL模型路径"), - value="GPT_SoVITS/pretrained_models/chinese-hubert-base", - interactive=False, - lines=2, - ) - with gr.Row(): - button1b_open = gr.Button( - value=process_info(process_name_1b, "open"), variant="primary", visible=True - ) - button1b_close = gr.Button( - value=process_info(process_name_1b, "close"), variant="primary", visible=False - ) - with gr.Row(): - info1b = gr.Textbox(label=process_info(process_name_1b, "info")) - - with gr.Accordion(label="1Ac-" + process_name_1c): + pretrained_s2G_ = gr.Textbox( + label=i18n("预训练SoVITS-G模型路径"), + value=pretrained_sovits_name[int(version[-1]) - 1], + interactive=False, + lines=2, + ) with gr.Row(): - with gr.Row(): - gpu_numbers1c = gr.Textbox( - label=i18n("GPU卡号以-分割,每个卡号一个进程"), - value="%s-%s" % (gpus, gpus), - interactive=True, - ) - with gr.Row(): - pretrained_s2G_ = gr.Textbox( - label=i18n("预训练SoVITS-G模型路径"), - value=pretrained_sovits_name[version], - interactive=False, - lines=2, - ) - with gr.Row(): - button1c_open = gr.Button( - value=process_info(process_name_1c, "open"), variant="primary", visible=True - ) - button1c_close = gr.Button( - value=process_info(process_name_1c, "close"), variant="primary", visible=False - ) - with gr.Row(): - info1c = gr.Textbox(label=process_info(process_name_1c, "info")) + button1c_open = gr.Button( + value=process_info(process_name_1c, "open"), variant="primary", visible=True + ) + button1c_close = gr.Button( + value=process_info(process_name_1c, "close"), variant="primary", visible=False + ) + with gr.Row(): + info1c = gr.Textbox(label=process_info(process_name_1c, "info")) - with gr.Accordion(label="1Aabc-" + 
process_name_1abc): + gr.Markdown(value="1Aabc-" + process_name_1abc) + with gr.Row(): with gr.Row(): - with gr.Row(): - button1abc_open = gr.Button( - value=process_info(process_name_1abc, "open"), variant="primary", visible=True - ) - button1abc_close = gr.Button( - value=process_info(process_name_1abc, "close"), variant="primary", visible=False - ) - with gr.Row(): - info1abc = gr.Textbox(label=process_info(process_name_1abc, "info")) + button1abc_open = gr.Button( + value=process_info(process_name_1abc, "open"), variant="primary", visible=True + ) + button1abc_close = gr.Button( + value=process_info(process_name_1abc, "close"), variant="primary", visible=False + ) + with gr.Row(): + info1abc = gr.Textbox(label=process_info(process_name_1abc, "info")) pretrained_s2G.change(sync, [pretrained_s2G], [pretrained_s2G_]) open_asr_button.click( @@ -1673,20 +1855,17 @@ def change_precision_choices(key): # 根据选择的模型修改可选的语言 button1a_close.click(close1a, [], [info1a, button1a_open, button1a_close]) button1b_open.click( open1b, - [version_checkbox, inp_text, inp_wav_dir, exp_name, gpu_numbers1Ba, cnhubert_base_dir], + [inp_text, inp_wav_dir, exp_name, gpu_numbers1Ba, cnhubert_base_dir], [info1b, button1b_open, button1b_close], ) button1b_close.click(close1b, [], [info1b, button1b_open, button1b_close]) button1c_open.click( - open1c, - [version_checkbox, inp_text, inp_wav_dir, exp_name, gpu_numbers1c, pretrained_s2G], - [info1c, button1c_open, button1c_close], + open1c, [inp_text, exp_name, gpu_numbers1c, pretrained_s2G], [info1c, button1c_open, button1c_close] ) button1c_close.click(close1c, [], [info1c, button1c_open, button1c_close]) button1abc_open.click( open1abc, [ - version_checkbox, inp_text, inp_wav_dir, exp_name, @@ -1702,151 +1881,147 @@ def change_precision_choices(key): # 根据选择的模型修改可选的语言 button1abc_close.click(close1abc, [], [info1abc, button1abc_open, button1abc_close]) with gr.TabItem("1B-" + i18n("微调训练")): - with gr.Accordion(label="1Ba-" + i18n("SoVITS 训练: 模型权重文件在 SoVITS_weights/")): - with gr.Row(): - with gr.Column(): - with gr.Row(): - batch_size = gr.Slider( - minimum=1, - maximum=default_max_batch_size, - step=1, - label=i18n("每张显卡的batch_size"), - value=default_batch_size, - interactive=True, - ) - total_epoch = gr.Slider( - minimum=1, - maximum=max_sovits_epoch, - step=1, - label=i18n("总训练轮数total_epoch,不建议太高"), - value=default_sovits_epoch, - interactive=True, - ) - with gr.Row(): - text_low_lr_rate = gr.Slider( - minimum=0.2, - maximum=0.6, - step=0.05, - label=i18n("文本模块学习率权重"), - value=0.4, - visible=True if version not in v3v4set else False, - ) # v3v4 not need - lora_rank = gr.Radio( - label=i18n("LoRA秩"), - value="32", - choices=["16", "32", "64", "128"], - visible=True if version in v3v4set else False, - ) # v1v2 not need - save_every_epoch = gr.Slider( - minimum=1, - maximum=max_sovits_save_every_epoch, - step=1, - label=i18n("保存频率save_every_epoch"), - value=default_sovits_save_every_epoch, - interactive=True, - ) - with gr.Column(): - with gr.Column(): - if_save_latest = gr.Checkbox( - label=i18n("是否仅保存最新的权重文件以节省硬盘空间"), - value=True, - interactive=True, - show_label=True, - ) - if_save_every_weights = gr.Checkbox( - label=i18n("是否在每次保存时间点将最终小模型保存至weights文件夹"), - value=True, - interactive=True, - show_label=True, - ) - if_grad_ckpt = gr.Checkbox( - label="v3是否开启梯度检查点节省显存占用", - value=False, - interactive=True if version in v3v4set else False, - show_label=True, - visible=False, - ) # 只有V3s2可以用 - with gr.Row(): - gpu_numbers1Ba = gr.Textbox( - 
label=i18n("GPU卡号以-分割,每个卡号一个进程"), - value="%s" % (gpus), - interactive=True, - ) - with gr.Row(): + gr.Markdown(value="1Ba-" + i18n("SoVITS 训练: 模型权重文件在 SoVITS_weights/")) + with gr.Row(): + with gr.Column(): with gr.Row(): - button1Ba_open = gr.Button( - value=process_info(process_name_sovits, "open"), variant="primary", visible=True + batch_size = gr.Slider( + minimum=1, + maximum=default_max_batch_size, + step=1, + label=i18n("每张显卡的batch_size"), + value=default_batch_size, + interactive=True, ) - button1Ba_close = gr.Button( - value=process_info(process_name_sovits, "close"), variant="primary", visible=False + total_epoch = gr.Slider( + minimum=1, + maximum=max_sovits_epoch, + step=1, + label=i18n("总训练轮数total_epoch,不建议太高"), + value=default_sovits_epoch, + interactive=True, ) with gr.Row(): - info1Ba = gr.Textbox(label=process_info(process_name_sovits, "info")) - with gr.Accordion(label="1Bb-" + i18n("GPT 训练: 模型权重文件在 GPT_weights/")): - with gr.Row(): - with gr.Column(): - with gr.Row(): - batch_size1Bb = gr.Slider( - minimum=1, - maximum=40, - step=1, - label=i18n("每张显卡的batch_size"), - value=default_batch_size_s1, - interactive=True, - ) - total_epoch1Bb = gr.Slider( - minimum=2, - maximum=50, - step=1, - label=i18n("总训练轮数total_epoch"), - value=15, - interactive=True, - ) - with gr.Row(): - save_every_epoch1Bb = gr.Slider( - minimum=1, - maximum=50, - step=1, - label=i18n("保存频率save_every_epoch"), - value=5, - interactive=True, - ) - if_dpo = gr.Checkbox( - label=i18n("是否开启DPO训练选项(实验性)"), - value=False, - interactive=True, - show_label=True, - ) + text_low_lr_rate = gr.Slider( + minimum=0.2, + maximum=0.6, + step=0.05, + label=i18n("文本模块学习率权重"), + value=0.4, + visible=True if version not in v3v4set else False, + ) # v3v4 not need + lora_rank = gr.Radio( + label=i18n("LoRA秩"), + value="32", + choices=["16", "32", "64", "128"], + visible=True if version in v3v4set else False, + ) # v1v2 not need + save_every_epoch = gr.Slider( + minimum=1, + maximum=max_sovits_save_every_epoch, + step=1, + label=i18n("保存频率save_every_epoch"), + value=default_sovits_save_every_epoch, + interactive=True, + ) + with gr.Column(): with gr.Column(): - with gr.Column(): - if_save_latest1Bb = gr.Checkbox( - label=i18n("是否仅保存最新的权重文件以节省硬盘空间"), - value=True, - interactive=True, - show_label=True, - ) - if_save_every_weights1Bb = gr.Checkbox( - label=i18n("是否在每次保存时间点将最终小模型保存至weights文件夹"), - value=True, - interactive=True, - show_label=True, - ) - with gr.Row(): - gpu_numbers1Bb = gr.Textbox( - label=i18n("GPU卡号以-分割,每个卡号一个进程"), - value="%s" % (gpus), - interactive=True, - ) + if_save_latest = gr.Checkbox( + label=i18n("是否仅保存最新的权重文件以节省硬盘空间"), + value=True, + interactive=True, + show_label=True, + ) + if_save_every_weights = gr.Checkbox( + label=i18n("是否在每次保存时间点将最终小模型保存至weights文件夹"), + value=True, + interactive=True, + show_label=True, + ) + if_grad_ckpt = gr.Checkbox( + label="v3是否开启梯度检查点节省显存占用", + value=False, + interactive=True if version in v3v4set else False, + show_label=True, + visible=False, + ) # 只有V3s2可以用 + with gr.Row(): + gpu_numbers1Ba = gr.Textbox( + label=i18n("GPU卡号以-分割,每个卡号一个进程"), value="%s" % (gpus), interactive=True + ) + with gr.Row(): + with gr.Row(): + button1Ba_open = gr.Button( + value=process_info(process_name_sovits, "open"), variant="primary", visible=True + ) + button1Ba_close = gr.Button( + value=process_info(process_name_sovits, "close"), variant="primary", visible=False + ) with gr.Row(): + info1Ba = gr.Textbox(label=process_info(process_name_sovits, "info")) + gr.Markdown(value="1Bb-" + 
i18n("GPT 训练: 模型权重文件在 GPT_weights/")) + with gr.Row(): + with gr.Column(): with gr.Row(): - button1Bb_open = gr.Button( - value=process_info(process_name_gpt, "open"), variant="primary", visible=True + batch_size1Bb = gr.Slider( + minimum=1, + maximum=40, + step=1, + label=i18n("每张显卡的batch_size"), + value=default_batch_size_s1, + interactive=True, + ) + total_epoch1Bb = gr.Slider( + minimum=2, + maximum=50, + step=1, + label=i18n("总训练轮数total_epoch"), + value=15, + interactive=True, + ) + with gr.Row(): + save_every_epoch1Bb = gr.Slider( + minimum=1, + maximum=50, + step=1, + label=i18n("保存频率save_every_epoch"), + value=5, + interactive=True, + ) + if_dpo = gr.Checkbox( + label=i18n("是否开启DPO训练选项(实验性)"), + value=False, + interactive=True, + show_label=True, + ) + with gr.Column(): + with gr.Column(): + if_save_latest1Bb = gr.Checkbox( + label=i18n("是否仅保存最新的权重文件以节省硬盘空间"), + value=True, + interactive=True, + show_label=True, ) - button1Bb_close = gr.Button( - value=process_info(process_name_gpt, "close"), variant="primary", visible=False + if_save_every_weights1Bb = gr.Checkbox( + label=i18n("是否在每次保存时间点将最终小模型保存至weights文件夹"), + value=True, + interactive=True, + show_label=True, ) with gr.Row(): - info1Bb = gr.Textbox(label=process_info(process_name_gpt, "info")) + gpu_numbers1Bb = gr.Textbox( + label=i18n("GPU卡号以-分割,每个卡号一个进程"), value="%s" % (gpus), interactive=True + ) + with gr.Row(): + with gr.Row(): + button1Bb_open = gr.Button( + value=process_info(process_name_gpt, "open"), variant="primary", visible=True + ) + button1Bb_close = gr.Button( + value=process_info(process_name_gpt, "close"), variant="primary", visible=False + ) + with gr.Row(): + info1Bb = gr.Textbox(label=process_info(process_name_gpt, "info")) button1Ba_close.click(close1Ba, [], [info1Ba, button1Ba_open, button1Ba_close]) button1Bb_close.click(close1Bb, [], [info1Bb, button1Bb_open, button1Bb_close]) @@ -1854,44 +2029,41 @@ def change_precision_choices(key): # 根据选择的模型修改可选的语言 with gr.TabItem("1C-" + i18n("推理")): gr.Markdown( value=i18n( - "选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的几个是底模,体验5秒Zero Shot TTS不训练推理用。" + "选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模,体验5秒Zero Shot TTS用。" ) ) with gr.Row(): - with gr.Column(scale=2): - with gr.Row(): - GPT_dropdown = gr.Dropdown( - label=i18n("GPT模型列表"), - choices=GPT_names, - value=GPT_names[-1], - interactive=True, - ) - SoVITS_dropdown = gr.Dropdown( - label=i18n("SoVITS模型列表"), - choices=SoVITS_names, - value=SoVITS_names[0], - interactive=True, - ) - with gr.Column(scale=2): - with gr.Row(): - gpu_number_1C = gr.Textbox( - label=i18n("GPU卡号,只能填1个整数"), value=gpus, interactive=True - ) - refresh_button = gr.Button(i18n("刷新模型路径"), variant="primary") + with gr.Row(): + GPT_dropdown = gr.Dropdown( + label=i18n("GPT模型列表"), + choices=sorted(GPT_names, key=custom_sort_key), + value=pretrained_gpt_name[0], + interactive=True, + ) + SoVITS_dropdown = gr.Dropdown( + label=i18n("SoVITS模型列表"), + choices=sorted(SoVITS_names, key=custom_sort_key), + value=pretrained_sovits_name[0], + interactive=True, + ) + with gr.Row(): + gpu_number_1C = gr.Textbox(label=i18n("GPU卡号,只能填1个整数"), value=gpus, interactive=True) + refresh_button = gr.Button(i18n("刷新模型路径"), variant="primary") refresh_button.click(fn=change_choices, inputs=[], outputs=[SoVITS_dropdown, GPT_dropdown]) - with gr.Row(equal_height=True): + with gr.Row(): with gr.Row(): batched_infer_enabled = gr.Checkbox( label=i18n("启用并行推理版本"), value=False, interactive=True, show_label=True ) + with gr.Row(): open_tts = gr.Button( 
value=process_info(process_name_tts, "open"), variant="primary", visible=True ) close_tts = gr.Button( value=process_info(process_name_tts, "close"), variant="primary", visible=False ) - with gr.Column(): - tts_info = gr.Textbox(label=process_info(process_name_tts, "info"), scale=2) + with gr.Row(): + tts_info = gr.Textbox(label=process_info(process_name_tts, "info")) open_tts.click( change_tts_inference, [ @@ -1919,7 +2091,6 @@ def change_precision_choices(key): # 根据选择的模型修改可选的语言 button1Ba_open.click( open1Ba, [ - version_checkbox, batch_size, total_epoch, exp_name,