diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..05f25f4
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,10 @@
+numpy
+cython
+librosa>=0.8.0
+numba==0.54.1
+scipy
+mido>=1.2.6
+pytest
+madmom
+torch
+matplotlib
diff --git a/src/BeatNet/BeatNet.py b/src/BeatNet/BeatNet.py
index a449fb9..8ae4788 100644
--- a/src/BeatNet/BeatNet.py
+++ b/src/BeatNet/BeatNet.py
@@ -9,11 +9,11 @@
 import torch
 import numpy as np
 from madmom.features import DBNDownBeatTrackingProcessor
-from BeatNet.particle_filtering_cascade import particle_filter_cascade
-from BeatNet.log_spect import LOG_SPECT
+from particle_filtering_cascade import particle_filter_cascade
+from log_spect import LOG_SPECT
 import librosa
 import sys
-from BeatNet.model import BDA
+from model import BDA
 import pyaudio
 import matplotlib.pyplot as plt
 import time
@@ -89,8 +89,8 @@ def __init__(self, model, mode='online', inference_model='PF', plot=[], thread=F
                                     rate=self.sample_rate,
                                     input=True,
                                     frames_per_buffer=self.log_spec_hop_length,)
-
-    def process(self, audio_path=None):
+    # returns beat/downbeat estimates built from self.pred
+    def process(self, audio_path=None):  # takes an audio path or audio object
         if self.mode == "stream":
             if self.inference_model != "PF":
                 raise RuntimeError('The inference model should be set to "PF" for the streaming mode!')
@@ -138,7 +138,7 @@ def process(self, audio_path=None):
                 output = self.estimator(preds)  # Using DBN offline inference to infer beat/downbeats
                 return output
-
+
         elif self.mode == "offline":
             if self.inference_model != "DBN":
                 raise RuntimeError('The inference model should be set to "DBN" for the offline mode!')
@@ -150,7 +150,7 @@ def process(self, audio_path=None):
             else:
                 raise RuntimeError('An audio object or file directory is required for the offline usage!')
-
+
     def activation_extractor_stream(self):  # TODO:
         '''
         Streaming window
@@ -164,15 +164,25 @@ def activation_extractor_stream(self):
                 self.pred = np.zeros([1,2])
             else:
                 feats = self.proc.process_audio(self.stream_window).T[-1]
+                print(feats.shape, 'shape of feats after extracting the last frame')
                 feats = torch.from_numpy(feats)
                 feats = feats.unsqueeze(0).unsqueeze(0).to(self.device)
+                print(feats.shape, 'shape of feats fed to the DL model')
                 pred = self.model(feats)[0]
+                print(pred.shape, 'shape of pred after the forward pass (index 0 taken)')
                 pred = self.model.final_pred(pred)
                 pred = pred.cpu().detach().numpy()
+                print(pred.shape, 'shape of pred after detaching and converting to numpy')
                 self.pred = np.transpose(pred[:2, :])

-    def activation_extractor_realtime(self, audio_path):
+    def activation_extractor_realtime(self, audio_path: str) -> None:
+        '''
+        Extracts activations from the audio data in real time.
+
+        Parameters:
+            audio_path (str): Path to the audio file.
+        '''
         with torch.no_grad():
             if self.counter == 0:  # loading the audio
                 if isinstance(audio_path, str):
@@ -196,20 +206,34 @@ def activation_extractor_realtime(self, audio_path):
                 self.completed = 1

-    def activation_extractor_online(self, audio_path):
+
+    def activation_extractor_online(self, audio_path: str) -> np.ndarray:
+        '''
+        Extracts activations from the audio data online.
+
+        Parameters:
+            audio_path (str): Path to the audio file.
+
+        Returns:
+            np.ndarray: A numpy array containing the extracted activations.
+        '''
         with torch.no_grad():
             if isinstance(audio_path, str):
-                audio, _ = librosa.load(audio_path, sr=self.sample_rate) # reading the data
-            elif len(np.shape(audio_path))>1:
-                audio = np.mean(audio_path ,axis=1)
+                audio, _ = librosa.load(audio_path, sr=self.sample_rate)  # reading the data
+            elif len(np.shape(audio_path)) > 1:
+                audio = np.mean(audio_path, axis=1)
             else:
                 audio = audio_path
             feats = self.proc.process_audio(audio).T
             feats = torch.from_numpy(feats)
             feats = feats.unsqueeze(0).to(self.device)
+            print(feats.shape, 'shape of feats fed to the model')
             preds = self.model(feats)[0]  # extracting the activations by passing the feature through the NN
             preds = self.model.final_pred(preds)
             preds = preds.cpu().detach().numpy()
+            print(preds.shape, 'shape of preds after detaching')
+            preds = np.transpose(preds[:2, :])
+            print(preds.shape, 'final shape of preds after transposing')
             return preds
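Note: `activation_extractor_online` now returns activations shaped `(num_frames, 2)`, with per-frame beat and downbeat probabilities, which is exactly what the offline branch of `process()` hands to madmom's DBN decoder. A minimal sketch of that hand-off, not part of the patch: the activations are synthetic stand-ins, and `beats_per_bar`/`fps` are illustrative values (the 20 ms hop used throughout corresponds to 50 fps).

```python
# Illustrative only: decode beat/downbeat times from (T, 2) activations with
# madmom's DBN, mirroring what BeatNet.process() does in offline mode.
import numpy as np
from madmom.features import DBNDownBeatTrackingProcessor

T = 500                              # hypothetical clip length: 10 s at 50 fps
preds = np.random.rand(T, 2) * 0.1   # stand-in activations: [:, 0]=beat, [:, 1]=downbeat
preds[::25, 0] = 0.9                 # fake a beat every 0.5 s (120 BPM)
preds[::100, 1] = 0.9                # fake a downbeat every 2 s (4/4 bars)

estimator = DBNDownBeatTrackingProcessor(beats_per_bar=[2, 3, 4], fps=50)
output = estimator(preds)            # rows of [beat_time_sec, position_in_bar]
print(output[:5])
```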
diff --git a/src/BeatNet/__pycache__/BeatNet.cpython-38.pyc b/src/BeatNet/__pycache__/BeatNet.cpython-38.pyc
new file mode 100644
index 0000000..57d2c3b
Binary files /dev/null and b/src/BeatNet/__pycache__/BeatNet.cpython-38.pyc differ
diff --git a/src/BeatNet/__pycache__/common.cpython-38.pyc b/src/BeatNet/__pycache__/common.cpython-38.pyc
new file mode 100644
index 0000000..2fae520
Binary files /dev/null and b/src/BeatNet/__pycache__/common.cpython-38.pyc differ
diff --git a/src/BeatNet/__pycache__/log_spect.cpython-38.pyc b/src/BeatNet/__pycache__/log_spect.cpython-38.pyc
new file mode 100644
index 0000000..e42b37f
Binary files /dev/null and b/src/BeatNet/__pycache__/log_spect.cpython-38.pyc differ
diff --git a/src/BeatNet/__pycache__/model.cpython-38.pyc b/src/BeatNet/__pycache__/model.cpython-38.pyc
new file mode 100644
index 0000000..0db3324
Binary files /dev/null and b/src/BeatNet/__pycache__/model.cpython-38.pyc differ
diff --git a/src/BeatNet/__pycache__/particle_filtering_cascade.cpython-38.pyc b/src/BeatNet/__pycache__/particle_filtering_cascade.cpython-38.pyc
new file mode 100644
index 0000000..7ae601a
Binary files /dev/null and b/src/BeatNet/__pycache__/particle_filtering_cascade.cpython-38.pyc differ
diff --git a/src/BeatNet/dataloader.py b/src/BeatNet/dataloader.py
new file mode 100644
index 0000000..103539e
--- /dev/null
+++ b/src/BeatNet/dataloader.py
@@ -0,0 +1,106 @@
+import torch
+from torch.utils.data import Dataset
+import numpy as np
+import os
+import librosa
+from common import *
+from log_spect import LOG_SPECT
+
+class BeatNetDataset(Dataset):
+    def __init__(self, audio_dir, target_dir):
+        self.audio_dir = audio_dir
+        self.target_dir = target_dir
+
+        # sort both listings so each audio file pairs with its target file
+        self.audio_path = [os.path.join(audio_dir, f) for f in sorted(os.listdir(audio_dir)) if f.endswith('.wav')]
+        self.target_path = [os.path.join(target_dir, f) for f in sorted(os.listdir(target_dir)) if f.endswith('.beats')]
+
+        if len(self.audio_path) != len(self.target_path):
+            raise ValueError('Number of audio files and target files do not match')
+
+        self.data_names = self._get_data_list()
+        self.sample_rate = 22050
+        self.log_spec_sample_rate = self.sample_rate
+        self.log_spec_hop_length = int(20 * 0.001 * self.log_spec_sample_rate)
+        self.log_spec_win_length = int(64 * 0.001 * self.log_spec_sample_rate)
+
+        self.proc = LOG_SPECT(sample_rate=self.log_spec_sample_rate, win_length=self.log_spec_win_length,
+                              hop_size=self.log_spec_hop_length, n_bands=[24], mode='online')
+
+    def __len__(self):
+        return len(self.audio_path)
+
+    def __getitem__(self, idx):
+        data = self._get_data(self.audio_path[idx])
+        # size the targets to this clip's feature frames, not the first file's
+        target = self._get_targets(self.target_path[idx], num_frames=data.shape[1])
+        return data, target
+
+    def _get_data(self, audio_path):
+        audio, _ = librosa.load(audio_path, sr=self.sample_rate)
+        if audio.ndim > 1:
+            audio = np.mean(audio, axis=1)
+        feats = self.proc.process_audio(audio).T
+        feats = torch.from_numpy(feats)
+        feats = feats.unsqueeze(0)  # add a leading batch dimension -> [1, num_frames, feature_dim]
+        return feats
+
+    def _get_targets(self, target_path, num_frames):
+        target_list = []
+        with open(target_path, 'r') as f:
+            for line in f:
+                parsed = self._text_label_to_float(line)
+                target_list.append(parsed)
+
+        beat_vector = np.zeros((num_frames, 3))
+
+        beat_times = np.array([x[0] for x in target_list]) * self.sample_rate
+
+        # mark the annotated frame (1.0) and its +/-2 neighbours (0.5)
+        for time in beat_times:
+            spec_frame = min(int(time / self.log_spec_hop_length), beat_vector.shape[0] - 1)
+            for n in range(-2, 3):
+                if 0 <= spec_frame + n < beat_vector.shape[0]:
+                    beat_vector[spec_frame + n] = 1.0 if n == 0 else 0.5
+
+        return torch.tensor(beat_vector)
+
+    def _get_data_list(self):
+        names = []
+        for entry in os.scandir(self.target_dir):
+            names.append(os.path.splitext(entry.name)[0])
+        return names
+
+    def _text_label_to_float(self, text):
+        '''Parse a "<time> <beat position>" annotation line into two floats.'''
+        allowed = '1234567890. \t'
+        filtered = ''.join([c for c in text if c in allowed])
+        if '\t' in filtered:
+            t = filtered.rstrip('\n').split('\t')
+        else:
+            t = filtered.rstrip('\n').split(' ')
+        return float(t[0]), float(t[1])
+
+if __name__ == '__main__':
+    # Test the dataloader
+    audio_dir = '/home/nikhil/moji/BeatNet/test/test_data/wav'
+    target_dir = '/home/nikhil/moji/BeatNet/test/test_data/beats'
+
+    try:
+        dataset = BeatNetDataset(audio_dir, target_dir)
+
+        # Fetch a sample
+        sample_data, sample_target = dataset[2]
+
+        # Print data and target shapes
+        print('Input data shape:', sample_data.shape)
+        print('Target shape:', sample_target.shape)
+        print('Target:', sample_target)
+        print('Data:', sample_data)
+
+        # Print the length of the dataset
+        print('Dataset length:', len(dataset))
+
+    except Exception as e:
+        print(f"An error occurred: {e}")
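For reference, the time-to-frame arithmetic `_get_targets` relies on works out as follows. This snippet is illustrative and not part of the patch; the annotation line is taken from one of the `.beats` files added below.

```python
# Sanity check of the mapping used in _get_targets: beat times in seconds are
# converted to samples, then to spectrogram frames via the 20 ms hop.
sample_rate = 22050
hop_length = int(20 * 0.001 * sample_rate)   # 441 samples per frame, i.e. 50 fps

line = "0.8499773240\t1\n"                   # a .beats line: <time> <position in bar>
time_sec, beat_pos = (float(x) for x in line.split())

frame = int(time_sec * sample_rate / hop_length)
print(frame)   # -> 42: this downbeat lands on spectrogram frame 42
```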
diff --git a/src/BeatNet/log_spect.py b/src/BeatNet/log_spect.py
index 6fab105..d3b6165 100644
--- a/src/BeatNet/log_spect.py
+++ b/src/BeatNet/log_spect.py
@@ -7,37 +57,52 @@
     FilteredSpectrogramProcessor, LogarithmicSpectrogramProcessor,
     SpectrogramDifferenceProcessor)
 from madmom.processors import ParallelProcessor, SequentialProcessor
-from BeatNet.common import *
+from common import FeatureModule
+import numpy as np
+import sys
+sys.path.append('/home/nikhil/moji/BeatNet/src/BeatNet')
 
-# feature extractor that extracts magnitude spectrogoram and its differences
+# Feature extractor that extracts the magnitude spectrogram and its differences
 class LOG_SPECT(FeatureModule):
     def __init__(self, num_channels=1, sample_rate=22050, win_length=2048, hop_size=512, n_bands=[12], mode='online'):
-        sig = SignalProcessor(num_channels=num_channels, win_length=win_length, sample_rate=sample_rate)
+        sig = SignalProcessor(num_channels=num_channels, sample_rate=sample_rate)
         self.sample_rate = sample_rate
         self.hop_length = hop_size
         self.num_channels = num_channels
         multi = ParallelProcessor([])
         frame_sizes = [win_length]
         num_bands = n_bands
-        for frame_size, num_bands in zip(frame_sizes, num_bands):
-            if mode == 'online' or mode == 'offline':
-                frames = FramedSignalProcessor(frame_size=frame_size, hop_size=hop_size)
-            else:  # for real-time and streaming modes
-                frames = FramedSignalProcessor(frame_size=frame_size, hop_size=hop_size, num_frames=4)
+        for frame_size, num_band in zip(frame_sizes, num_bands):
+            if mode in ['online', 'offline']:
+                frames = FramedSignalProcessor(frame_size=frame_size, hop_size=hop_size)
+            else:  # for real-time and streaming modes
+                frames = FramedSignalProcessor(frame_size=frame_size, hop_size=hop_size, num_frames=4)
             stft = ShortTimeFourierTransformProcessor()  # caching FFT window
-            filt = FilteredSpectrogramProcessor(
-                num_bands=num_bands, fmin=30, fmax=17000, norm_filters=True)
+            filt = FilteredSpectrogramProcessor(num_bands=num_band, fmin=30, fmax=17000, norm_filters=True)
             spec = LogarithmicSpectrogramProcessor(mul=1, add=1)
-            diff = SpectrogramDifferenceProcessor(
-                diff_ratio=0.5, positive_diffs=True, stack_diffs=np.hstack)
-            # process each frame size with spec and diff sequentially
+            diff = SpectrogramDifferenceProcessor(diff_ratio=0.5, positive_diffs=True, stack_diffs=np.hstack)
+            # Process each frame size with spec and diff sequentially
             multi.append(SequentialProcessor((frames, stft, filt, spec, diff)))
-        # stack the features and processes everything sequentially
+        # Stack the features and process everything sequentially
         self.pipe = SequentialProcessor((sig, multi, np.hstack))
 
     def process_audio(self, audio):
         feats = self.pipe(audio)
         return feats.T
+
+if __name__ == '__main__':
+    # Test the feature extraction module on a sample audio file
+    audio_path = '/home/nikhil/moji/BeatNet/test/test_data/wav/Albums-AnaBelen_Veneo-01.wav'
+    op = LOG_SPECT().process_audio(audio_path)
+    print(op.shape)
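The dataloader above constructs `LOG_SPECT` with a 64 ms window, 20 ms hop, and `n_bands=[24]`, mirroring the settings `BeatNet.py` uses. A minimal, self-contained shape check under those settings, not part of the patch; the expected 272-dimensional output (filterbank bins stacked with their positive differences) is an assumption based on the input size BeatNet's model expects:

```python
# Illustrative check of LOG_SPECT output shapes with the dataloader's settings.
import numpy as np
from log_spect import LOG_SPECT   # assumes src/BeatNet is on sys.path, as in the diff

sr = 22050
proc = LOG_SPECT(sample_rate=sr,
                 win_length=int(64 * 0.001 * sr),   # 1411 samples
                 hop_size=int(20 * 0.001 * sr),     # 441 samples -> 50 fps
                 n_bands=[24],
                 mode='online')

audio = np.random.randn(sr * 2).astype(np.float32)  # two seconds of noise
feats = proc.process_audio(audio)                   # (feature_dim, num_frames)
print(feats.shape)   # expected roughly (272, ~100): 272 features per frame at 50 fps
```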
diff --git a/src/BeatNet/model.py b/src/BeatNet/model.py
index b65c1a4..c559313 100644
--- a/src/BeatNet/model.py
+++ b/src/BeatNet/model.py
@@ -38,16 +38,25 @@ def __init__(self, dim_in, num_cells, num_layers, device):
 
     def forward(self, data):
         x = data
+        print(np.shape(x), 'input data shape')
         x = torch.reshape(x, (-1, self.dim_in))
+        print(np.shape(x), 'after reshape')
         x = x.unsqueeze(0).transpose(0, 1)
+        print(np.shape(x), 'after unsqueeze and transpose')
         x = F.max_pool1d(F.relu(self.conv1(x)), 2)
+        print(np.shape(x), 'after conv1 and max_pool1d')
         x = x.view(-1, self.num_flat_features(x))
+        print(np.shape(x), 'after view')
         x = self.linear0(x)
+        print(np.shape(x), 'after linear0')
         x = torch.reshape(x, (np.shape(data)[0], np.shape(data)[1], self.conv_out))
+        print(np.shape(x), 'after reshape')
         x, (self.hidden, self.cell) = self.lstm(x, (self.hidden, self.cell))
-        # x = self.lstm(x)[0]
+        print(np.shape(x), 'after lstm')
         out = self.linear(x)
+        print(np.shape(out), 'after linear')
         out = out.transpose(1, 2)
+        print(np.shape(out), 'final output shape')
         return out
 
     def final_pred(self, input):
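The hand-inserted print statements above can also be achieved with forward hooks, which log each submodule's output shape without editing `forward()` at all. A sketch, not part of the patch; `model` stands for an instantiated `BDA`:

```python
# Attach a shape-logging hook to every named submodule of a model.
import torch

def register_shape_hooks(model: torch.nn.Module):
    handles = []
    for name, module in model.named_modules():
        if not name:        # skip the root module itself
            continue
        def hook(mod, inputs, output, name=name):
            # LSTM returns (output, (h, c)); log the first element in that case
            out = output[0] if isinstance(output, tuple) else output
            print(f'{name}: {tuple(out.shape)}')
        handles.append(module.register_forward_hook(hook))
    return handles          # call h.remove() on each handle to stop logging
```

Calling `register_shape_hooks(model)` once before a forward pass prints the same trace as the scattered prints, and removing the handles restores silent operation.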
diff --git a/src/BeatNet/test.py b/src/BeatNet/test.py
new file mode 100644
index 0000000..243b84b
--- /dev/null
+++ b/src/BeatNet/test.py
@@ -0,0 +1,8 @@
+from BeatNet import BeatNet
+
+estimator = BeatNet(1, mode='online', inference_model='PF', plot=['activations'], thread=False)
+
+
+output = estimator.process('/home/nikhil/moji/BeatNet/test/test_data/wav/Albums-AnaBelen_Veneo-02.wav')
+
+print(output)
\ No newline at end of file
diff --git a/src/BeatNet/train.py b/src/BeatNet/train.py
new file mode 100644
index 0000000..e69de29
diff --git a/test/test_data/beats/Albums-AnaBelen_Veneo-01.beats b/test/test_data/beats/Albums-AnaBelen_Veneo-01.beats
new file mode 100644
index 0000000..1f535e7
--- /dev/null
+++ b/test/test_data/beats/Albums-AnaBelen_Veneo-01.beats
@@ -0,0 +1,49 @@
+0.1944671200 4
+0.8499773240 1
+1.4700226750 2
+2.0800000000 3
+2.6800000000 4
+3.3000453510 1
+3.9299773240 2
+4.5299773240 3
+5.1500226750 4
+5.7600000000 1
+6.3600000000 2
+6.9600000000 3
+7.5900226750 4
+8.2000000000 1
+8.8200453510 2
+9.4300226750 3
+10.0499773240 4
+10.6600453510 1
+11.2800000000 2
+11.8899773240 3
+12.5000453510 4
+13.1200000000 1
+13.7100226750 2
+14.3400453510 3
+14.9600000000 4
+15.5600000000 1
+16.1699773240 2
+16.7800453510 3
+17.4099773240 4
+18.0200453510 1
+18.6600453510 2
+19.2700226750 3
+19.8700226750 4
+20.4899773240 1
+21.0899773240 2
+21.7100226750 3
+22.3100226750 4
+22.9400453510 1
+23.5600000000 2
+24.1699773240 3
+24.7699773240 4
+25.4000000000 1
+25.9800453510 2
+26.6000000000 3
+27.2200453510 4
+27.8300226750 1
+28.4499773240 2
+29.0600453510 3
+29.6600453510 4
diff --git a/test/test_data/beats/Albums-AnaBelen_Veneo-02.beats b/test/test_data/beats/Albums-AnaBelen_Veneo-02.beats
new file mode 100644
index 0000000..015662a
--- /dev/null
+++ b/test/test_data/beats/Albums-AnaBelen_Veneo-02.beats
@@ -0,0 +1,47 @@
+2.0200000000 4
+2.6280000000 1
+3.2370000000 2
+3.8380000000 3
+4.4330000000 4
+5.0300000000 1
+5.6310000000 2
+6.2300000000 3
+6.8240000000 4
+7.4260000000 1
+8.0340000000 2
+8.6440000000 3
+9.2440000000 4
+9.8430000000 1
+10.4470000000 2
+11.0510000000 3
+11.6500000000 4
+12.2470000000 1
+12.8440000000 2
+13.4400000000 3
+14.0370000000 4
+14.6400000000 1
+15.2470000000 2
+15.8570000000 3
+16.4610000000 4
+17.0630000000 1
+17.6670000000 2
+18.2710000000 3
+18.8700000000 4
+19.4670000000 1
+20.0700000000 2
+20.6740000000 3
+21.2600000000 4
+21.8530000000 1
+22.4590000000 2
+23.0760000000 3
+23.6780000000 4
+24.2740000000 1
+24.8600000000 2
+25.4650000000 3
+26.0800000000 4
+26.6840000000 1
+27.2710000000 2
+27.8540000000 3
+28.4410000000 4
+29.0430000000 1
+29.6400000000 2
diff --git a/test/test_data/beats/Albums-AnaBelen_Veneo-03.beats b/test/test_data/beats/Albums-AnaBelen_Veneo-03.beats
new file mode 100644
index 0000000..a98ddde
--- /dev/null
+++ b/test/test_data/beats/Albums-AnaBelen_Veneo-03.beats
@@ -0,0 +1,60 @@
+0.4100000000 1
+0.9000000000 2
+1.3940000000 3
+1.8900000000 4
+2.3880000000 1
+2.8870000000 2
+3.3880000000 3
+3.8910000000 4
+4.3940000000 1
+4.8970000000 2
+5.3980000000 3
+5.8980000000 4
+6.3990000000 1
+6.9010000000 2
+7.4030000000 3
+7.9060000000 4
+8.4080000000 1
+8.9100000000 2
+9.4110000000 3
+9.9120000000 4
+10.4140000000 1
+10.9150000000 2
+11.4170000000 3
+11.9180000000 4
+12.4200000000 1
+12.9210000000 2
+13.4220000000 3
+13.9220000000 4
+14.4230000000 1
+14.9220000000 2
+15.4220000000 3
+15.9200000000 4
+16.4200000000 1
+16.9210000000 2
+17.4240000000 3
+17.9270000000 4
+18.4310000000 1
+18.9370000000 2
+19.4440000000 3
+19.9500000000 4
+20.4550000000 1
+20.9590000000 2
+21.4620000000 3
+21.9640000000 4
+22.4650000000 1
+22.9620000000 2
+23.4530000000 3
+23.9550000000 4
+24.4610000000 1
+24.9700000000 2
+25.4700000000 3
+25.9690000000 4
+26.4680000000 1
+26.9650000000 2
+27.4630000000 3
+27.9650000000 4
+28.4670000000 1
+28.9750000000 2
+29.4590000000 3
+29.9530000000 4
diff --git a/test/test_data/wav/Albums-AnaBelen_Veneo-01.wav b/test/test_data/wav/Albums-AnaBelen_Veneo-01.wav
new file mode 100644
index 0000000..05c7e58
Binary files /dev/null and b/test/test_data/wav/Albums-AnaBelen_Veneo-01.wav differ
diff --git a/test/test_data/wav/Albums-AnaBelen_Veneo-02.wav b/test/test_data/wav/Albums-AnaBelen_Veneo-02.wav
new file mode 100644
index 0000000..5ca8ff7
Binary files /dev/null and b/test/test_data/wav/Albums-AnaBelen_Veneo-02.wav differ
diff --git a/test/test_data/wav/Albums-AnaBelen_Veneo-03.wav b/test/test_data/wav/Albums-AnaBelen_Veneo-03.wav
new file mode 100644
index 0000000..5c415da
Binary files /dev/null and b/test/test_data/wav/Albums-AnaBelen_Veneo-03.wav differ
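The patch adds `train.py` as an empty placeholder. A skeleton of how the new `BeatNetDataset` could drive training of `BDA`, illustrative only: the hyperparameters (272 input features, 150 LSTM cells, 2 layers) are assumed from how `BeatNet.py` instantiates the model, and the loss wiring over the 3-channel targets is one plausible choice, not the authors' method.

```python
# Hypothetical training skeleton for the empty train.py added by this patch.
import torch
from torch.utils.data import DataLoader
from dataloader import BeatNetDataset
from model import BDA

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
dataset = BeatNetDataset('test/test_data/wav', 'test/test_data/beats')
loader = DataLoader(dataset, batch_size=1, shuffle=True)

model = BDA(272, 150, 2, device).to(device)   # assumed hyperparameters
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = torch.nn.CrossEntropyLoss()

for epoch in range(10):
    for feats, target in loader:
        feats = feats.squeeze(1).float().to(device)   # (1, T, 272)
        target = target.argmax(dim=-1).to(device)     # (1, T): class index per frame
        # BDA keeps its LSTM state as attributes; detach so backprop
        # does not reach into graphs from previous batches
        model.hidden = model.hidden.detach()
        model.cell = model.cell.detach()
        optimizer.zero_grad()
        logits = model(feats)                         # (1, 3, T) after the final transpose
        loss = criterion(logits, target)
        loss.backward()
        optimizer.step()
    print(f'epoch {epoch}: loss {loss.item():.4f}')
```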