alibabasglab committed
Commit 27b0df9 · verified · 1 Parent(s): 83a0f63

Update networks.py

Files changed (1)
  1. networks.py +42 -64
networks.py CHANGED
@@ -53,7 +53,7 @@ class SpeechModel:
 
     def load_model(self):
 
-        checkpoint_path = hf_hub_download(repo_id=f"alibabasglab/{self.args.model_name}", filename="last_best_checkpoint.pt")
+        checkpoint_path = hf_hub_download(repo_id=f"alibabasglab/{self.args.model_name}", filename="last_checkpoint.pt")
 
         # Load the checkpoint file into memory (map_location ensures compatibility with different devices)
         checkpoint = torch.load(checkpoint_path, map_location=lambda storage, loc: storage)
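The only change in this hunk is the checkpoint filename, from last_best_checkpoint.pt to last_checkpoint.pt. The sketch below is a minimal, standalone way to check that the renamed file resolves and loads; it assumes huggingface_hub and torch are installed, and the model name is a hypothetical stand-in for args.model_name.

import torch
from huggingface_hub import hf_hub_download

model_name = "EEYD_locoformer"  # hypothetical stand-in for args.model_name
# Fetch the renamed checkpoint; hf_hub_download caches it locally after the first call
checkpoint_path = hf_hub_download(
    repo_id=f"alibabasglab/{model_name}",
    filename="last_checkpoint.pt",
)
# map_location keeps the load device-agnostic, matching load_model() above
checkpoint = torch.load(checkpoint_path, map_location=lambda storage, loc: storage)
print(sorted(checkpoint.keys()))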
@@ -173,81 +173,59 @@ class select_network(nn.Module):
         super(select_network, self).__init__()
         self.args = args
 
-        # audio backbone network
-        if args.model_name == 'EEYD_mrx':
-            from models.mrx.mrx import MRX
-            self.sep_network = MRX(args)
-        elif args.model_name in ['EEYD_demucs']:
-            from models.eeyd.eeyd import eeyd
-            self.sep_network = eeyd(args)
-        elif args.model_name in ['EEYD_locoformer']:
-            from models.tflocoformer.tflocoformer_separator import TFLocoformer
-            self.sep_network = TFLocoformer(args)
-        else:
-            raise NameError('Wrong network selection')
+
+        from models.tflocoformer.tflocoformer_separator import TFLocoformer
+        self.sep_network = TFLocoformer(args)
+
 
         print(f'{args.model_name} running.')
 
 
-        if self.args.network_reference.text_network == 't5':
-            import os
-            from transformers import AutoTokenizer, T5EncoderModel
-            model_path = snapshot_download(repo_id="alibabasglab/t5-base")
-            model_path = os.path.join(model_path, "t5-base")
-            # model_path = hf_hub_download(repo_id="alibabasglab/t5-base", filename="t5-base")
-            self.tokenizer = AutoTokenizer.from_pretrained(model_path, model_max_length=512)
-            self.text_encoder = T5EncoderModel.from_pretrained(model_path)
-            # os.environ["TOKENIZERS_PARALLELISM"] = "false"
-            for param in self.text_encoder.parameters():
-                param.requires_grad = False
-        else:
-            raise NameError('Wrong text network selection')
-
+        import os
+        from transformers import AutoTokenizer, T5EncoderModel
+        model_path = snapshot_download(repo_id="alibabasglab/t5-base")
+        model_path = os.path.join(model_path, "t5-base")
+        # model_path = hf_hub_download(repo_id="alibabasglab/t5-base", filename="t5-base")
+        self.tokenizer = AutoTokenizer.from_pretrained(model_path, model_max_length=512)
+        self.text_encoder = T5EncoderModel.from_pretrained(model_path)
+        # os.environ["TOKENIZERS_PARALLELISM"] = "false"
+        for param in self.text_encoder.parameters():
+            param.requires_grad = False
+
 
-        if self.args.network_audio.backbone in ['eeyd','tflocoformer']:
-            if self.args.network_audio.add_feature in ['beats']:
-                from models.beats.BEATs import BEATs, BEATsConfig
-                model_path = snapshot_download(repo_id="alibabasglab/beats")
-                model_path = os.path.join(model_path, "BEATs_iter3_plus_AS2M.pt")
-                checkpoint = torch.load(model_path)
-                cfg = BEATsConfig(checkpoint['cfg'])
-                self.BEATs_model = BEATs(cfg)
-                self.BEATs_model.load_state_dict(checkpoint['model'])
-                self.BEATs_model.eval()
-
-                for param in self.BEATs_model.parameters():
-                    param.requires_grad = False
+        from models.beats.BEATs import BEATs, BEATsConfig
+        model_path = snapshot_download(repo_id="alibabasglab/beats")
+        model_path = os.path.join(model_path, "BEATs_iter3_plus_AS2M.pt")
+        checkpoint = torch.load(model_path)
+        cfg = BEATsConfig(checkpoint['cfg'])
+        self.BEATs_model = BEATs(cfg)
+        self.BEATs_model.load_state_dict(checkpoint['model'])
+        self.BEATs_model.eval()
+
+        for param in self.BEATs_model.parameters():
+            param.requires_grad = False
 
 
 
     def forward(self, mixture, t_ref, device):
         mixture = torch.tensor(mixture).to(device)
         mixture = mixture.unsqueeze(0)
-        if self.args.network_reference.text_network == 't5':
-            text_input = self.tokenizer(t_ref, return_tensors="pt", truncation=True, padding="longest")
-            text_input_ids = text_input["input_ids"].to(device)
-            text_attention_mask = text_input["attention_mask"].to(device)
-            text_len = torch.sum(text_attention_mask, dim=1)
-            text_embedding = self.text_encoder(input_ids=text_input_ids, attention_mask=text_attention_mask).last_hidden_state
-            t_ref = (text_embedding.clone().detach(), text_attention_mask.clone().detach(), text_len.clone().detach())
-        else: # clap series
-            text_embedding = self.text_encoder.get_text_embedding(t_ref, use_tensor=True)
-            text_embedding = text_embedding.clone().detach()
-
-            text_attention_mask = torch.ones((text_embedding.shape[0],1), dtype=torch.int32)
-            text_len = torch.ones((text_embedding.shape[0]), dtype=torch.int32)
-            text_embedding = self.clap_us(text_embedding.unsqueeze(1))
-            t_ref = (text_embedding, text_attention_mask.to(device), text_len)
-
-        if self.args.network_audio.backbone in ['eeyd','tflocoformer']:
-            if self.args.network_audio.add_feature in ['beats']:
-                with torch.no_grad():
-                    padding_mask = torch.zeros_like(mixture).bool()
-                    a_ref = self.BEATs_model.extract_features(mixture, padding_mask=padding_mask)[0]
-                    a_ref = a_ref.transpose(1,2)
-                return self.forword_step(mixture, t_ref, a_ref.clone().detach())
-
-        return self.sep_network(mixture, t_ref)
+
+        text_input = self.tokenizer(t_ref, return_tensors="pt", truncation=True, padding="longest")
+        text_input_ids = text_input["input_ids"].to(device)
+        text_attention_mask = text_input["attention_mask"].to(device)
+        text_len = torch.sum(text_attention_mask, dim=1)
+        text_embedding = self.text_encoder(input_ids=text_input_ids, attention_mask=text_attention_mask).last_hidden_state
+        t_ref = (text_embedding.clone().detach(), text_attention_mask.clone().detach(), text_len.clone().detach())
+
+
+        with torch.no_grad():
+            padding_mask = torch.zeros_like(mixture).bool()
+            a_ref = self.BEATs_model.extract_features(mixture, padding_mask=padding_mask)[0]
+            a_ref = a_ref.transpose(1,2)
+        return self.forword_step(mixture, t_ref, a_ref.clone().detach())
+
 
 
     def forword_step(self, mixture, t_ref, a_ref):
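The rewritten __init__ and forward() above drop the MRX/Demucs backbone branches and the CLAP text path, leaving a single TFLocoformer separator with a frozen T5 text encoder and frozen BEATs features. As a reference, here is a standalone sketch of the T5 text-reference path; it is not part of the commit and loads the public t5-base checkpoint instead of the alibabasglab/t5-base snapshot, but the tokenizer and encoder calls, and the (embedding, attention_mask, length) tuple, mirror forward().

import torch
from transformers import AutoTokenizer, T5EncoderModel

tokenizer = AutoTokenizer.from_pretrained("t5-base", model_max_length=512)
text_encoder = T5EncoderModel.from_pretrained("t5-base").eval()

captions = ["a man speaking while a dog barks in the background"]  # example text reference
text_input = tokenizer(captions, return_tensors="pt", truncation=True, padding="longest")
with torch.no_grad():
    text_embedding = text_encoder(
        input_ids=text_input["input_ids"],
        attention_mask=text_input["attention_mask"],
    ).last_hidden_state  # (batch, seq_len, 768) for t5-base
text_len = text_input["attention_mask"].sum(dim=1)
t_ref = (text_embedding, text_input["attention_mask"], text_len)
print(text_embedding.shape, text_len)

Because the encoder parameters are frozen (requires_grad = False), these embeddings stay fixed during training; only the separator consumes them.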
 
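A similar sketch for the frozen BEATs audio-feature path that forward() now always runs before calling forword_step(). It assumes the repo's models.beats package is importable and treats the mixture as a 16 kHz mono waveform (an assumption; the commit does not state the sample rate); the calls otherwise mirror the hunk above.

import os
import torch
from huggingface_hub import snapshot_download
from models.beats.BEATs import BEATs, BEATsConfig

# Download and restore the frozen BEATs checkpoint, as in __init__ above
model_dir = snapshot_download(repo_id="alibabasglab/beats")
checkpoint = torch.load(os.path.join(model_dir, "BEATs_iter3_plus_AS2M.pt"), map_location="cpu")
beats = BEATs(BEATsConfig(checkpoint['cfg']))
beats.load_state_dict(checkpoint['model'])
beats.eval()

mixture = torch.randn(1, 16000)  # one second of dummy audio at the assumed 16 kHz rate
with torch.no_grad():
    padding_mask = torch.zeros_like(mixture).bool()  # no padded samples in this toy batch
    a_ref = beats.extract_features(mixture, padding_mask=padding_mask)[0]
a_ref = a_ref.transpose(1, 2)  # (batch, frames, dim) -> (batch, dim, frames), as in forward()
print(a_ref.shape)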