diff --git a/GPT_SoVITS/AR/data/bucket_sampler.py b/GPT_SoVITS/AR/data/bucket_sampler.py
index d84573340..c2db920e1 100644
--- a/GPT_SoVITS/AR/data/bucket_sampler.py
+++ b/GPT_SoVITS/AR/data/bucket_sampler.py
@@ -39,12 +39,12 @@ def __init__(
         if num_replicas is None:
             if not dist.is_available():
                 raise RuntimeError("Requires distributed package to be available")
-            num_replicas = dist.get_world_size() if torch.cuda.is_available() else 1
+            num_replicas = dist.get_world_size() if dist.is_initialized() else 1
         if rank is None:
             if not dist.is_available():
                 raise RuntimeError("Requires distributed package to be available")
-            rank = dist.get_rank() if torch.cuda.is_available() else 0
-        if torch.cuda.is_available():
+            rank = dist.get_rank() if dist.is_initialized() else 0
+        if torch.cuda.is_available() and dist.is_initialized():
             torch.cuda.set_device(rank)
         if rank >= num_replicas or rank < 0:
             raise ValueError("Invalid rank {}, rank should be in the interval [0, {}]".format(rank, num_replicas - 1))
diff --git a/GPT_SoVITS/s1_train.py b/GPT_SoVITS/s1_train.py
index 1176f0bce..61f42f641 100644
--- a/GPT_SoVITS/s1_train.py
+++ b/GPT_SoVITS/s1_train.py
@@ -118,7 +118,7 @@ def main(args):
         benchmark=False,
         fast_dev_run=False,
         strategy=DDPStrategy(process_group_backend="nccl" if platform.system() != "Windows" else "gloo")
-        if torch.cuda.is_available()
+        if torch.cuda.is_available() and torch.cuda.device_count() > 1
         else "auto",
         precision=config["train"]["precision"],
         logger=logger,
diff --git a/GPT_SoVITS/s2_train.py b/GPT_SoVITS/s2_train.py
index 333e6a05a..a1275269a 100644
--- a/GPT_SoVITS/s2_train.py
+++ b/GPT_SoVITS/s2_train.py
@@ -77,12 +77,13 @@ def run(rank, n_gpus, hps):
         writer = SummaryWriter(log_dir=hps.s2_ckpt_dir)
         writer_eval = SummaryWriter(log_dir=os.path.join(hps.s2_ckpt_dir, "eval"))

-    dist.init_process_group(
-        backend="gloo" if os.name == "nt" or not torch.cuda.is_available() else "nccl",
-        init_method="env://?use_libuv=False",
-        world_size=n_gpus,
-        rank=rank,
-    )
+    if not (os.name == "nt" and n_gpus == 1):
+        dist.init_process_group(
+            backend="gloo" if os.name == "nt" or not torch.cuda.is_available() else "nccl",
+            init_method="env://?use_libuv=False",
+            world_size=n_gpus,
+            rank=rank,
+        )
     torch.manual_seed(hps.train.seed)
     if torch.cuda.is_available():
         torch.cuda.set_device(rank)
@@ -197,8 +198,18 @@ def run(rank, n_gpus, hps):
         eps=hps.train.eps,
     )
     if torch.cuda.is_available():
-        net_g = DDP(net_g, device_ids=[rank], find_unused_parameters=True)
-        net_d = DDP(net_d, device_ids=[rank], find_unused_parameters=True)
+        if os.name == "nt" and n_gpus == 1:
+            class DummyDDP(torch.nn.Module):
+                def __init__(self, module):
+                    super().__init__()
+                    self.module = module
+                def forward(self, *args, **kwargs):
+                    return self.module(*args, **kwargs)
+            net_g = DummyDDP(net_g)
+            net_d = DummyDDP(net_d)
+        else:
+            net_g = DDP(net_g, device_ids=[rank], find_unused_parameters=True)
+            net_d = DDP(net_d, device_ids=[rank], find_unused_parameters=True)
     else:
         net_g = net_g.to(device)
         net_d = net_d.to(device)
diff --git a/GPT_SoVITS/s2_train_v3.py b/GPT_SoVITS/s2_train_v3.py
index bcde98a81..6d09c6e80 100644
--- a/GPT_SoVITS/s2_train_v3.py
+++ b/GPT_SoVITS/s2_train_v3.py
@@ -77,12 +77,13 @@ def run(rank, n_gpus, hps):
         writer = SummaryWriter(log_dir=hps.s2_ckpt_dir)
         writer_eval = SummaryWriter(log_dir=os.path.join(hps.s2_ckpt_dir, "eval"))

-    dist.init_process_group(
-        backend="gloo" if os.name == "nt" or not torch.cuda.is_available() else "nccl",
-        init_method="env://?use_libuv=False",
-        world_size=n_gpus,
-        rank=rank,
-    )
+    if not (os.name == "nt" and n_gpus == 1):
+        dist.init_process_group(
+            backend="gloo" if os.name == "nt" or not torch.cuda.is_available() else "nccl",
+            init_method="env://?use_libuv=False",
+            world_size=n_gpus,
+            rank=rank,
+        )
     torch.manual_seed(hps.train.seed)
     if torch.cuda.is_available():
         torch.cuda.set_device(rank)
@@ -166,8 +167,18 @@ def run(rank, n_gpus, hps):
     #     eps=hps.train.eps,
     # )
     if torch.cuda.is_available():
-        net_g = DDP(net_g, device_ids=[rank], find_unused_parameters=True)
-        # net_d = DDP(net_d, device_ids=[rank], find_unused_parameters=True)
+        if os.name == "nt" and n_gpus == 1:
+            class DummyDDP(torch.nn.Module):
+                def __init__(self, module):
+                    super().__init__()
+                    self.module = module
+                def forward(self, *args, **kwargs):
+                    return self.module(*args, **kwargs)
+            net_g = DummyDDP(net_g)
+            # net_d = DummyDDP(net_d)
+        else:
+            net_g = DDP(net_g, device_ids=[rank], find_unused_parameters=True)
+            # net_d = DDP(net_d, device_ids=[rank], find_unused_parameters=True)
     else:
         net_g = net_g.to(device)
         # net_d = net_d.to(device)
diff --git a/GPT_SoVITS/s2_train_v3_lora.py b/GPT_SoVITS/s2_train_v3_lora.py
index ff62ccfe1..40a1ad06f 100644
--- a/GPT_SoVITS/s2_train_v3_lora.py
+++ b/GPT_SoVITS/s2_train_v3_lora.py
@@ -77,12 +77,13 @@ def run(rank, n_gpus, hps):
         writer = SummaryWriter(log_dir=hps.s2_ckpt_dir)
         writer_eval = SummaryWriter(log_dir=os.path.join(hps.s2_ckpt_dir, "eval"))

-    dist.init_process_group(
-        backend="gloo" if os.name == "nt" or not torch.cuda.is_available() else "nccl",
-        init_method="env://?use_libuv=False",
-        world_size=n_gpus,
-        rank=rank,
-    )
+    if not (os.name == "nt" and n_gpus == 1):
+        dist.init_process_group(
+            backend="gloo" if os.name == "nt" or not torch.cuda.is_available() else "nccl",
+            init_method="env://?use_libuv=False",
+            world_size=n_gpus,
+            rank=rank,
+        )
     torch.manual_seed(hps.train.seed)
     if torch.cuda.is_available():
         torch.cuda.set_device(rank)
@@ -156,7 +157,16 @@ def get_optim(net_g):

 def model2cuda(net_g, rank):
     if torch.cuda.is_available():
-        net_g = DDP(net_g.cuda(rank), device_ids=[rank], find_unused_parameters=True)
+        if os.name == "nt" and n_gpus == 1:
+            class DummyDDP(torch.nn.Module):
+                def __init__(self, module):
+                    super().__init__()
+                    self.module = module
+                def forward(self, *args, **kwargs):
+                    return self.module(*args, **kwargs)
+            net_g = DummyDDP(net_g.cuda(rank))
+        else:
+            net_g = DDP(net_g.cuda(rank), device_ids=[rank], find_unused_parameters=True)
     else:
         net_g = net_g.to(device)
     return net_g
diff --git a/GPT_SoVITS/utils.py b/GPT_SoVITS/utils.py
index 08e183842..93ed7ada9 100644
--- a/GPT_SoVITS/utils.py
+++ b/GPT_SoVITS/utils.py
@@ -64,12 +64,25 @@ def load_checkpoint(checkpoint_path, model, optimizer=None, skip_optimizer=False
 from time import time as ttime
+import time
+


 def my_save(fea, path):  #####fix issue: torch.save doesn't support chinese path
     dir = os.path.dirname(path)
     name = os.path.basename(path)
-    tmp_path = "%s.pth" % (ttime())
+    tmp_path = "%s.pth" % (time.time())
     torch.save(fea, tmp_path)
-    shutil.move(tmp_path, "%s/%s" % (dir, name))
+    target_path = "%s/%s" % (dir, name)
+    try:
+        shutil.move(tmp_path, target_path)
+    except Exception as e:
+        print(f"Move failed with error {e}, retrying via copy and delete...")
+        if os.path.exists(target_path):
+            try:
+                os.remove(target_path)
+            except:
+                pass
+        shutil.copyfile(tmp_path, target_path)
+        os.remove(tmp_path)


 def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path):
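
The DummyDDP shim introduced in s2_train.py, s2_train_v3.py, and s2_train_v3_lora.py works because it mirrors the one attribute of torch.nn.parallel.DistributedDataParallel that the surrounding training code depends on: .module. A minimal standalone sketch of the pattern follows; the toy torch.nn.Linear model and the variable names net/out/state are illustrative only and not part of this patch:

    import torch

    class DummyDDP(torch.nn.Module):
        # Transparent stand-in for DistributedDataParallel on single-GPU Windows runs,
        # where no process group is initialized and real DDP cannot be constructed.
        def __init__(self, module):
            super().__init__()
            self.module = module  # mirror DDP's .module so unwrapping code keeps working

        def forward(self, *args, **kwargs):
            return self.module(*args, **kwargs)

    net = DummyDDP(torch.nn.Linear(4, 2))
    out = net(torch.randn(3, 4))     # forwards to the wrapped model, just like DDP
    state = net.module.state_dict()  # same unwrap path checkpoint-saving code uses with DDP

Because the wrapper is registered as a plain torch.nn.Module, optimizers built from net.parameters() and checkpoint code that reads net.module both behave identically whether net is wrapped in DDP or DummyDDP.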