| """ | |
| Authors: Zexu Pan, Shengkui Zhao | |
| """ | |
| import torch | |
| import torch.nn as nn | |
| import torch.nn.functional as F | |
| from torch.cuda.amp import autocast, GradScaler | |
| import soundfile as sf | |
| import librosa | |
| import tempfile | |
| import os | |
| import subprocess | |
| from tqdm import tqdm | |
| from huggingface_hub import hf_hub_download | |
| from huggingface_hub import snapshot_download | |
| import numpy as np | |
| import ffmpeg | |


class SpeechModel:
    def __init__(self, args):
        if torch.cuda.is_available():
            print('GPU is found and used!')
            self.device = torch.device('cuda')
        else:
            # If no GPU is detected, use the CPU
            args.use_cuda = 0
            self.device = torch.device('cpu')
        self.args = args
        self.model = None
        self.name = None
        self.data = {}

    def get_free_gpu(self):
        try:
            # Run nvidia-smi to query GPU memory usage and free memory
            result = subprocess.run(
                ['nvidia-smi', '--query-gpu=memory.used,memory.free', '--format=csv,nounits,noheader'],
                stdout=subprocess.PIPE)
            gpu_info = result.stdout.decode('utf-8').strip().split('\n')
            free_gpu = None
            max_free_memory = 0
            for i, info in enumerate(gpu_info):
                used, free = map(int, info.split(','))
                if free > max_free_memory:
                    max_free_memory = free
                    free_gpu = i
            return free_gpu
        except Exception as e:
            print(f"Error finding free GPU: {e}")
            return None

    def load_model(self):
        checkpoint_path = hf_hub_download(repo_id=f"alibabasglab/{self.args.model_name}", filename="last_checkpoint.pt")
        # Load the checkpoint file into memory (map_location ensures compatibility with different devices)
        checkpoint = torch.load(checkpoint_path, map_location=lambda storage, loc: storage)
        # Load the model's state dictionary (weights and biases) into the current model
        if 'model' in checkpoint:
            pretrained_model = checkpoint['model']
        else:
            pretrained_model = checkpoint
        state = self.model.state_dict()
        for key in state.keys():
            if key in pretrained_model and state[key].shape == pretrained_model[key].shape:
                state[key] = pretrained_model[key]
            elif key.replace('module.', '') in pretrained_model and state[key].shape == pretrained_model[key.replace('module.', '')].shape:
                state[key] = pretrained_model[key.replace('module.', '')]
            elif 'module.' + key in pretrained_model and state[key].shape == pretrained_model['module.' + key].shape:
                state[key] = pretrained_model['module.' + key]
            else:
                raise NameError(f'{key} not loaded')
        self.model.load_state_dict(state)
        print('Successfully loaded model weights for decoding')

    def process(self, file_path, text):
        orig_audio = self.load_data(file_path)
        text = [text]
        with torch.no_grad():
            chunk_size = 160000  # 240000
            print(orig_audio.shape)
            if orig_audio.shape[0] > chunk_size:
                # Long inputs are processed in fixed-size chunks to bound memory usage
                output_audio = torch.zeros(1, orig_audio.shape[0])
                for itr in range(0, orig_audio.shape[0] // chunk_size):
                    output_audio[:, chunk_size * itr:chunk_size * (itr + 1)] = self.model(
                        orig_audio[chunk_size * itr:chunk_size * (itr + 1)], text, self.device)
                # Cover the trailing remainder by re-running the model on the last full-size window
                output_audio[:, -chunk_size:] = self.model(orig_audio[-chunk_size:], text, self.device)
            else:
                output_audio = self.model(orig_audio, text, self.device)
        output_audio = output_audio.detach().squeeze().cpu().numpy()
        # residual_audio = residual_audio.detach().squeeze().cpu().numpy()
        residual_audio = orig_audio - output_audio
        return orig_audio, output_audio, residual_audio

    def _audioread(self, path, sampling_rate):
        data, fs = sf.read(path, dtype='float32')
        if len(data.shape) > 1:
            # Keep only the first channel of multi-channel recordings
            if data.shape[0] > data.shape[1]:
                data = data[:, 0]
            else:
                data = data[0, :]
        if fs != sampling_rate:
            data = librosa.resample(data, orig_sr=fs, target_sr=sampling_rate)
        max_val = np.max(np.abs(data))
        if max_val > 1:
            data /= max_val
        return data

    def _videoread(self, path, sampling_rate):
        try:
            # Use ffmpeg to extract the audio track as raw 16-bit PCM (mono, resampled)
            process = (
                ffmpeg
                .input(path)
                .output('pipe:', format='s16le', acodec='pcm_s16le', ar=sampling_rate, ac=1)
                .run(capture_stdout=True, capture_stderr=True)
            )
            # Read the audio samples from the raw output
            audio_data = np.frombuffer(process[0], dtype=np.int16)
            # Convert to float32 and scale to [-1, 1]
            audio_data = audio_data.astype(np.float32) / 32768.0
            max_val = np.max(np.abs(audio_data))
            if max_val > 1:
                audio_data /= max_val
            return audio_data
        except ffmpeg.Error as e:
            print(f"Error loading audio from video: {e.stderr.decode()}")
            return None

    def load_data(self, file_path):
        """
        Detect whether the file is audio or video, then process it.
        - Audio: Load using `soundfile`.
        - Video: Extract the audio track and resample to the target sampling rate.
        """
        # Check if the file exists
        if not os.path.isfile(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")
        # Supported audio and video extensions
        audio_extensions = ['.wav', '.flac', '.mp3', '.ogg', '.mat']
        video_extensions = ['.mp4', '.mkv', '.avi', '.mov', '.webm']
        _, ext = os.path.splitext(file_path)
        ext = ext.lower()
        if ext in audio_extensions:
            # Handle audio files
            print(f"Processing audio file: {file_path}")
            data = self._audioread(file_path, self.args.sampling_rate)
            return data
        elif ext in video_extensions:
            # Handle video files
            print(f"Processing video file: {file_path}")
            data = self._videoread(file_path, self.args.sampling_rate)
            return data
        else:
            raise ValueError(f"Unsupported file type: {file_path}")


class select_network(nn.Module):
    def __init__(self, args):
        super(select_network, self).__init__()
        self.args = args

        # Separation backbone
        from models.tflocoformer.tflocoformer_separator import TFLocoformer
        self.sep_network = TFLocoformer(args)
        print(f'{args.model_name} running.')

        # Frozen T5 text encoder for embedding the text query
        from transformers import AutoTokenizer, T5EncoderModel
        model_path = snapshot_download(repo_id="alibabasglab/t5-base")
        model_path = os.path.join(model_path, "t5-base")
        # model_path = hf_hub_download(repo_id="alibabasglab/t5-base", filename="t5-base")
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, model_max_length=512)
        self.text_encoder = T5EncoderModel.from_pretrained(model_path)
        # os.environ["TOKENIZERS_PARALLELISM"] = "false"
        for param in self.text_encoder.parameters():
            param.requires_grad = False

        # Frozen BEATs audio encoder for extracting acoustic reference features
        from models.beats.BEATs import BEATs, BEATsConfig
        model_path = snapshot_download(repo_id="alibabasglab/beats")
        model_path = os.path.join(model_path, "BEATs_iter3_plus_AS2M.pt")
        checkpoint = torch.load(model_path)
        cfg = BEATsConfig(checkpoint['cfg'])
        self.BEATs_model = BEATs(cfg)
        self.BEATs_model.load_state_dict(checkpoint['model'])
        self.BEATs_model.eval()
        for param in self.BEATs_model.parameters():
            param.requires_grad = False

    def forward(self, mixture, t_ref, device):
        mixture = torch.tensor(mixture).to(device)
        mixture = mixture.unsqueeze(0)

        # Encode the text query with the frozen T5 encoder
        text_input = self.tokenizer(t_ref, return_tensors="pt", truncation=True, padding="longest")
        text_input_ids = text_input["input_ids"].to(device)
        text_attention_mask = text_input["attention_mask"].to(device)
        text_len = torch.sum(text_attention_mask, dim=1)
        text_embedding = self.text_encoder(input_ids=text_input_ids, attention_mask=text_attention_mask).last_hidden_state
        t_ref = (text_embedding.clone().detach(), text_attention_mask.clone().detach(), text_len.clone().detach())

        # Extract acoustic reference features from the mixture with the frozen BEATs encoder
        with torch.no_grad():
            padding_mask = torch.zeros_like(mixture).bool()
            a_ref = self.BEATs_model.extract_features(mixture, padding_mask=padding_mask)[0]
        a_ref = a_ref.transpose(1, 2)
        return self.forward_step(mixture, t_ref, a_ref.clone().detach())

    def forward_step(self, mixture, t_ref, a_ref):
        return self.sep_network(mixture, t_ref, a_ref)


class network_wrapper(SpeechModel):
    def __init__(self, args):
        # Initialize the parent SpeechModel class
        super(network_wrapper, self).__init__(args)
        # Build the separation network (TF-Locoformer backbone with T5 text and BEATs audio encoders)
        self.model = select_network(args)
        # Load pre-trained model checkpoint
        self.load_model()
        # Move model to the appropriate device (GPU/CPU)
        self.model.to(self.device)
        # Set the model to evaluation mode (no gradient calculation)
        self.model.eval()
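

# The block below is a minimal usage sketch added for illustration; it is not part of
# the original file. It assumes `args` only needs the fields referenced above
# (model_name, sampling_rate, use_cuda); the real TFLocoformer config very likely
# expects additional fields, and the model name and file path here are placeholders.
if __name__ == "__main__":
    from argparse import Namespace

    args = Namespace(
        model_name="ClearSep",   # hypothetical repo suffix under alibabasglab/
        sampling_rate=16000,     # target rate used by _audioread/_videoread
        use_cuda=1,
    )
    wrapper = network_wrapper(args)
    # Separate the sound described by the text query from the input recording
    orig, extracted, residual = wrapper.process("example.wav", "a dog barking")
    sf.write("extracted.wav", extracted, args.sampling_rate)
    sf.write("residual.wav", residual, args.sampling_rate)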