From 5d971db8d80191c24972e7b2e62f02f8fbf6887f Mon Sep 17 00:00:00 2001 From: Ingmars Melkis Date: Wed, 17 Jul 2019 13:48:23 +0300 Subject: [PATCH] Published code to github/gitlab --- .gitignore | 16 +++++ LICENSE | 7 +++ README.MD | 59 ++++++++++++++++++ config.py | 31 ++++++++++ dataset.py | 76 +++++++++++++++++++++++ evaluate.py | 96 ++++++++++++++++++++++++++++++ main.py | 88 +++++++++++++++++++++++++++ misc/README.MD | 24 ++++++++ misc/apply_vocal_mask.py | 63 ++++++++++++++++++++ misc/cnn_output_plot.py | 52 ++++++++++++++++ misc/create_binmask.py | 40 +++++++++++++ misc/create_spectrogram.py | 29 +++++++++ misc/test_tensorflow.py | 3 + model.py | 107 +++++++++++++++++++++++++++++++++ requirements.txt | 62 +++++++++++++++++++ song.py | 119 +++++++++++++++++++++++++++++++++++++ structure.md | 39 ++++++++++++ 17 files changed, 911 insertions(+) create mode 100644 .gitignore create mode 100644 LICENSE create mode 100644 README.MD create mode 100644 config.py create mode 100644 dataset.py create mode 100644 evaluate.py create mode 100644 main.py create mode 100644 misc/README.MD create mode 100644 misc/apply_vocal_mask.py create mode 100644 misc/cnn_output_plot.py create mode 100644 misc/create_binmask.py create mode 100644 misc/create_spectrogram.py create mode 100644 misc/test_tensorflow.py create mode 100644 model.py create mode 100644 requirements.txt create mode 100644 song.py create mode 100644 structure.md diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5bda8aa --- /dev/null +++ b/.gitignore @@ -0,0 +1,16 @@ +__pycache__/* +*.sw[a-p] +*.ini +log.* +*.log +data/* +test +test-valid +ignored/* +*.save +*.weights +*.out +*.wav +*.h5 +*.csv +*.png diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..758d5d7 --- /dev/null +++ b/LICENSE @@ -0,0 +1,7 @@ +Copyright 2019 Ingmars Daniels Melkis + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/README.MD b/README.MD new file mode 100644 index 0000000..326c829 --- /dev/null +++ b/README.MD @@ -0,0 +1,59 @@ +Vocal and music seperation using a CNN +=== + +# Description + +This CNN attempts to separate the vocals from the music. It does so by training on the amplitude data of the audio file and tries to estimate where the voiced parts are. Vocal separation is done by generating a binary mask of the time-frequency bins that the network thinks contain the vocals and applying it to the original file. 
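+
+To give a rough idea of that masking step, here is a minimal standalone sketch using librosa directly. The all-ones mask below is only a placeholder for the network's predictions, and the STFT settings mirror the defaults in `config.py`; it is not the training pipeline itself.
+
+```python
+import numpy as np
+import librosa
+
+# Load the mixture and compute its STFT (config.py defaults: 22050 Hz, window 1024, hop 256)
+audio, sr = librosa.load("mixture.wav", sr=22050, mono=True)
+spectrogram = librosa.stft(audio, 1024, hop_length=256)
+
+# One value per time-frequency bin; in this project the 0/1 values come from the CNN's output
+mask = np.ones(spectrogram.shape)
+
+# Apply the mask and invert the STFT to get the estimated vocals back as audio
+vocals = librosa.istft(spectrogram * mask, hop_length=256)
+librosa.output.write_wav("vocals.wav", vocals, sr, norm=True)
+```
+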
+ +# Requirements + +* Python 3 +* Tensorflow (Tested with tensorflow-gpu), Keras +* And a few other python libraries that you can install by running `pip install -r requirements.txt` in the root directory + +# Dataset + +* The script was only tested with .wav files (16-bit and 24-bit signed wavs should work, 32-bit float doesn't). Other formats might work if your version of librosa is capable of opening it. +* Training data folder should have individual folder for each song. Each song should have two files - `mixture.wav` (the full song) and `vocals.wav` (original vocals). See below for a list of data sets that you could potentially use to train this network. +* To see an example of how the directory structures should look like, refer to [structure.md](structure.md). +* To make things faster, all songs should have the same sampling rate as configured (I only tested 22050kHz, but other sample rates should work) and should be in mono (if it isn't, the script will convert them, but the result isn't saved anywhere and it takes a while). + +## Example data sets + +* DSD100: https://sigsep.github.io/datasets/dsd100.html +* MUSDB: https://sigsep.github.io/datasets/musdb.html +* MedleyDB: https://medleydb.weebly.com/ + +# Setting up + +1. pip install -r requirements.txt +2. py main.py + +# Running + +1. `python main.py -h` to see all arguments +2. `python main.py` will train the network with the default options +3. `python main.py --mode=separate --file=audio.wav` will attempt source separation on `audio.wav` and will output `vocals.wav` +4. `python main.py --mode=evaluate` will evaluate the effectiveness of audio source separation. More information below. + +## Configuring + +All relevant settings are located in the `config.ini` file. The file doesn't exist in the repository and will be automatically created and prepopulated with the default values on first run. For information on what each option does see `config.py`. + +## Evaluating + +This program also includes a simple wrapper around BSS-Eval which can be used to determine how effective audio source separation is. To use it you need - the original vocals (`vocals.wav`), the original accompaniment (`accompaniment.wav`), estimated vocals (`estimated-vocals.wav`) and estimated accompaniment (`estimated-accompaniment.wav`). If you don't have the accompaniment but have a mixture and vocals, you can use the `apply_vocal_mask.py` script in the `misc` folder. To get estimated accompaniment, you need to perform separation with the `--save_accompaniment` flag set to true. After you have all the files, create a data directory that contains a directory with the name of the song and copy all 4 files to it. + +Note that librosa by default outputs a 32bit wav file which it can't load without ffmpeg, so you either need to add an extra conversion step between separating and evaluation or install ffmpeg and add it to your PATH. Both files need to have the same format and bitrate for evaluation to be successful as well. + +### Bug + +For a reason I haven't had the time to determine yet the neural network output sometimes has slightly less samples (~76 samples to be exact which is around 0.004s worth of samples) than the original. The evaluation script will account for this, but be advised that some samples are being lost during evaluation. + +## Weights files when training + +While training the network will save its weights every 5 epochs to avoid data loss should you have a power failure or a similar issue. These files may be deleted after training. 
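+
+## Converting librosa's output for evaluation
+
+As mentioned under Evaluating, librosa writes 32-bit float wavs that it can't load back without ffmpeg. One possible conversion step, sketched with the SoundFile package already listed in `requirements.txt` (the file names here are just examples), is:
+
+```python
+import soundfile as sf
+
+# Rewrite the estimated vocals as 16-bit PCM so they can be loaded again for evaluation
+data, samplerate = sf.read("estimated-vocals.wav")
+sf.write("estimated-vocals-16bit.wav", data, samplerate, subtype="PCM_16")
+```
+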
+ +## Misc + +The misc directory contains a few scripts that might be useful but aren't required to run the neural net. diff --git a/config.py b/config.py new file mode 100644 index 0000000..a459bc3 --- /dev/null +++ b/config.py @@ -0,0 +1,31 @@ +import configparser + +def prepare_config(filename): + config = configparser.ConfigParser() + config.read(filename) + + # Set defaults + config_get(config, 'logging', 'logfile', 'log.txt') + config_get(config, 'logging', 'loglevel', 'INFO') #debug,info,warning,critical + config_get(config, 'logging', 'logtype', 'console') #file/console + + config_get(config, 'song', 'sample_size', "22050") #Sample rate of the audio we will work with. If loaded audio doesn't match, it will be resampled. + config_get(config, 'song', 'window_size', "1024") #We will get window size / 2 + 1 frequency bins to work with. 1024-1568 seems to be the perfect vales. + config_get(config, 'song', 'hop_length', "256") #Size of each bin = hop size / sample size (in ms). The smaller it is, the more bins we get, but we don't need that much resolution. + config_get(config, 'song', 'sample_length', "25") #Dictates how many frequency bins we give to the neural net for context. Less samples means more guesswork from the network, but also more samples from each song. + + config_get(config, 'model', 'save_history', "true") #Saves keras accuracy and loss history per epoch + config_get(config, 'model', 'history_filename', "history.csv") + + with open(filename, 'w') as configfile: # If the file didn't exist, write default values to it + config.write(configfile) + return config + +def config_get(config, section, key, default): + try: + config.get(section, key) + except configparser.NoSectionError: + config.add_section(section) + config.set(section, key, default) + except configparser.NoOptionError: + config.set(section, key, default) diff --git a/dataset.py b/dataset.py new file mode 100644 index 0000000..b2fd84a --- /dev/null +++ b/dataset.py @@ -0,0 +1,76 @@ +import os +import sys +import logging +from song import Song +import numpy as np + +# Dataset: Loads and passes test data to the model +class Dataset: + def __init__(self, logger, config): + self.logger=logger + self.config=config + # Raw data + self.mixtures = [] + self.vocals = [] + # Outputs for CNN + self.mixture_windows = [] + self.labels = [] + + # Load mixture and vocals and generates STFT for them + def load(self, folder): + if os.path.isdir(folder): + for root, dirs, files in os.walk(folder): + for file in filter(lambda f: f.endswith(".wav"), files): + self.logger.info("Loading song %s and computing stft for it.", os.path.join(root, file)) + song_type = os.path.splitext(file)[0].lower() + if song_type == "mixture" or song_type == "vocals": + song = Song(self.logger, os.path.basename(root), self.config) + song.load_file(os.path.join(root,file)) + song.compute_stft() + if(song_type == "mixture"): + self.mixtures.append(song) + elif(song_type == "vocals"): + self.vocals.append(song) + self.logger.debug("%s loaded successfully.", song_type) + else: + self.logger.debug("File %s is not named correctly. 
Ignoring...", song_type) + else: + self.logger.critical("Folder %s does not exist!", folder) + sys.exit(8) + if(len(self.mixture) != len(self.vocals)): + self.logger.critical("There doesn't appear to be a vocal track for each mixture (or the other way around).") + sys.exit(15) + + def get_data_for_cnn(self): + length = self.config.getint("song", "sample_length") + self.logger.info("Preparing data of type 'mixture' for the CNN...") + if len(self.mixtures) == 0: + self.logger.critical("No mixtures for training found. Did you name them wrong?") + sys.exit(9) + self.logger.debug("Preparing %i songs...", len(self.mixtures)) + amplitudes = None + for num in range(0, len(self.mixtures)): + if amplitudes is None: + amplitudes = self.mixtures[0].split_spectrogram(length) + else: + amplitudes = np.vstack((amplitudes, self.mixtures[0].split_spectrogram(length))) + del self.mixtures[0] + self.logger.debug("Got %i slices. Each slice has %i frequency bins, and each frequency bin has %i time slices.", len(amplitudes), len(amplitudes[0]), len(amplitudes[0][0])) + self.logger.debug("Adding a 4th dimension to placate the CNN model...") + # Add a dimension to make the CNN accept the data. Signifies that we have a greyscale "picture" + amplitudes = np.array(amplitudes).reshape(len(amplitudes), len(amplitudes[0]), len(amplitudes[0][0]), 1) + self.mixture_windows = amplitudes + + def get_labels_for_cnn(self): + length = self.config.getint("song", "sample_length") + self.logger.info("Preparing data of type 'vocals' for the CNN...") + if len(self.vocals) == 0: + self.logger.critical("No original vocals for training found. Did you name them wrong?") + sys.exit(10) + self.logger.debug("Preparing %i songs...", len(self.vocals)) + labels = [] + for num in range(0, len(self.vocals)): + labels.extend(self.vocals[0].get_labels(length)) + del self.vocals[0] + self.logger.debug("Got %i slices.", len(labels)) + self.labels = np.array(labels) diff --git a/evaluate.py b/evaluate.py new file mode 100644 index 0000000..41cde96 --- /dev/null +++ b/evaluate.py @@ -0,0 +1,96 @@ +# Evaluate the accuracy of the neural network by calculating SDR (distortion) +# SIR (interference from other sources) and SAR (artifacts) +import numpy as np +import museval +import os +import sys +from song import Song + +class Evaluator: + def __init__(self, logger, config): + self.logger=logger + self.config=config + self.vocals=None + self.accompaniments=None + self.estimated_vocals=None + self.estimated_accompaniments=None + self.names=None + + def load_data(self, folder): + self.vocals=[] + self.accompaniments=[] + self.estimated_vocals=[] + self.estimated_accompaniments=[] + if os.path.isdir(folder): + for root, firs, files in os.walk(folder): + for file in filter(lambda f: f.endswith(".wav"), files): + song_type = os.path.splitext(file)[0].lower() + self.logger.info("Loading song %s.", os.path.join(root, file)) + if song_type == "vocals" or song_type == "accompaniment" or song_type == "estimated_vocals" or song_type == "estimated_accompaniment": + song = Song(self.logger, os.path.basename(root), self.config) + song.load_file(os.path.join(root,file)) + if(song_type == "vocals"): + self.vocals.append(song) + elif(song_type == "accompaniment"): + self.accompaniments.append(song) + elif(song_type == "estimated_vocals"): + self.estimated_vocals.append(song) + elif(song_type == "estimated_accompaniment"): + self.estimated_accompaniments.append(song) + self.logger.debug("%s loaded successfully.", song_type) + else: + self.logger.debug("File %s is not 
named correctly. Ignoring...", song_type) + else: + self.logger.critical("Folder %s does not exist!", folder) + sys.exit(13) + if (len(self.vocals) != len(self.accompaniments)) or (len(self.accompaniments) != len(self.estimated_vocals)) or (len(self.estimated_vocals) != len(self.estimated_accompaniments)): + self.logger.critical("Array size mismatch. Did you misname a file?") + sys.exit(14) + + # Extracts data from the dataset and sets the correct dimensions + def prepare_data(self): + self.names = [] + for element in range(0, len(self.vocals)): + self.logger.debug("Processing %s...", self.vocals[element].get_name()) + self.names.append(self.vocals[element].get_name()) + self.vocals[element] = np.expand_dims(self.vocals[element].get_raw_data(), 1) + self.accompaniments[element] = np.expand_dims(self.accompaniments[element].get_raw_data(), 1) + self.estimated_vocals[element] = np.expand_dims(self.estimated_vocals[element].get_raw_data(), 1) + self.estimated_accompaniments[element] = np.expand_dims(self.estimated_accompaniments[element].get_raw_data(), 1) + self.vocals = np.array(self.vocals) + self.accompaniments = np.array(self.accompaniments) + self.estimated_vocals = np.array(self.estimated_vocals) + self.estimated_accompaniments = np.array(self.estimated_accompaniments) + # Since the neural net outputs slightly less data than in the original, we will cut off the part that we can't compare + # Simply padding WOULD be a better idea, but we can't assume that the last few miliseconds have nothing going on in them. + for element in range(0, len(self.vocals)): + if np.shape(self.vocals[element])[0] > np.shape(self.estimated_vocals[element])[0]: + self.logger.debug("Reshaping arrays for %s...", self.names[element]) + difference = np.shape(self.vocals[element])[0] - np.shape(self.estimated_vocals[element])[0] + self.vocals[element] = self.vocals[element,:-difference,:] + self.accompaniments[element] = self.accompaniments[element,:-difference,:] + + def calculate_metrics(self): + sdr = sir = sar = [] + for element in range(0, len(self.vocals)): + original_data = np.stack((self.vocals[element], self.accompaniments[element])) + estimated_data = np.stack((self.estimated_vocals[element], self.estimated_accompaniments[element])) + museval.metrics.validate(original_data, estimated_data) + self.logger.info("Calculating metrics for %s...", self.names[element]) + obtained_sdr, _, obtained_sir, obtained_sar, _ = museval.metrics.bss_eval(original_data, estimated_data, window=np.inf, hop=0) + if element == 0: + sdr = obtained_sdr + sir = obtained_sir + sar = obtained_sar + else: + sdr = np.column_stack((sdr, obtained_sdr)) + sir = np.column_stack((sir, obtained_sir)) + sar = np.column_stack((sar, obtained_sar)) + return sdr, sir, sar + + def print_metrics(self, sdr, sir, sar): + self.logger.info("Printing results...") + for element in range(0, len(self.names)): + self.logger.info("Song name: %s", self.names[element]) + self.logger.info("Vocals: SDR: %.2f, SIR: %.2f, SAR: %.2f", sdr[0][element], sir[0][element], sar[0][element]) + self.logger.info("Accompaniments: SDR: %.2f, SIR: %.2f, SAR: %.2f", sdr[1][element], sir[1][element], sar[1][element]) diff --git a/main.py b/main.py new file mode 100644 index 0000000..e5cd4f7 --- /dev/null +++ b/main.py @@ -0,0 +1,88 @@ +# Entry point for a script to attempt vocal and music separation using Tensorflow +# Accompanying thesis: Vocal and Audio separation using deep learning for Riga Technical University. 
+# Author: Ingmars Daniels Melkis , Student ID: 161RDB280 + +import logging +import argparse +import os +import sys +from dataset import Dataset +from model import Model +from song import Song +from config import prepare_config +from evaluate import Evaluator + +# Set up - Load config, arguments and set up logging +config = prepare_config('config.ini') +if config.get("logging", "logtype") == "file": + logging.basicConfig(filename=config.get('logging', 'logfile'), level=logging.getLevelName(config.get('logging', 'loglevel')), filemode='a', format='%(asctime)s - %(levelname)s - %(message)s', datefmt='%d-%b-%y %H:%M:%S') +elif config.get("logging", "logtype") == "console": + logging.basicConfig(level=logging.getLevelName(config.get('logging', 'loglevel')), format='%(asctime)s - %(levelname)s - %(message)s', datefmt='%d-%b-%y %H:%M:%S') +logging.addLevelName(55, "Hello!") +logging.addLevelName(56, "Goodbye!") + +parser = argparse.ArgumentParser(description="Neural network for vocal and music splitting") +parser.add_argument("--mode", default="train", type=str, help="Mode in which the script is run (train/separate/evaluate).") +parser.add_argument("--weights", default="network.weights", type=str, help="File containing the weights to be used with the neural network. Will be created if it doesn't exist. Required for separation. Default is network.weights.") +parser.add_argument("--datadir", default="data", type=str, help="Directory in which the training data is located in. Default is data. (requires --mode=train)") +parser.add_argument("--validationdir", default="data-valid", type=str, help="Directory in which the validation data is located in. Default is data-valid. (requires --mode=train)") +parser.add_argument("--evaluationdir", default="evaluate", type=str, help="Directory in which separated data and the originals are located in. Default is evaluate. (requires --mode=evaluate)") +parser.add_argument("--epochs", default=1, type=int, help="How many times will the network go over the data. default - 1. (requires --mode=train)") +parser.add_argument("--file", default="mixture.wav", type=str, help="Name of the file from which to extract vocals. (requires --mode=separate)") +parser.add_argument("--output", default="vocals.wav", type=str, help="Name of the file to which the vocals will be written to. (requires --mode=separate)") +parser.add_argument("--dump_data", default="false", type=str, help="If set to true, dumps raw data for everything. Takes up a lot of space, but can be potentially useful for comparing results. 
(requires --mode=separate)") +parser.add_argument("--save_accompaniment", default="false", type=str, help="If set to true, the accompaniment will also be saved as a separate file (requires --mode=separate)") +args = parser.parse_args() + +logging.log(55, 'Script started.') +if args.mode == "train": + logging.info("Preparing to train a model...") + dataset = Dataset(logging, config) + dataset.load(args.datadir) + dataset.get_data_for_cnn() + dataset.get_labels_for_cnn() + validation_set = Dataset(logging, config) + validation_set.load(args.validationdir) + validation_set.get_data_for_cnn() + validation_set.get_labels_for_cnn() + model = Model(logging, config, dataset, validation_set) + model.build(output_summary=True) + if os.path.isfile(args.weights): + logging.info("Found existing weights, loading them...") + model.load(args.weights) + model.train(args.epochs, save_log=config.getboolean("model", "save_history"), log_name=config.get("model", "history_filename")) + logging.info("Saving weights...") + model.save(args.weights) +elif args.mode == "separate": + logging.info("Preparing to separate vocals from instrumentals...") + mixture = Song(logging, "a mixture", config) + mixture.load_file(args.file) + mixture.compute_stft(keep_spectrogram=True) + dump_data = True if args.dump_data.lower() in ("yes", "true", "y", "t", "1") else False + save_accompaniment = True if args.save_accompaniment.lower() in ("yes", "true", "y", "t", "1") else False + if dump_data is True: + mixture.dump_amplitude("original") + mixture.dump_spectrogram("original") + model = Model(logging, config) + model.build() + if os.path.isfile(args.weights): + model.load(args.weights) + else: + logging.critical("Couldn't find a weights file.") + sys.exit(11) + if dump_data is True: + model.isolate(mixture, args.output, save_accompaniment=save_accompaniment, save_original_mask=True, save_original_probabilities=True) + mixture.dump_spectrogram("processed") + else: + model.isolate(mixture, args.output) +elif args.mode == "evaluate": + logging.info("Preparing to evaluate the effectiveness of an output") + evaluator = Evaluator(logging, config) + evaluator.load_data(args.evaluationdir) + evaluator.prepare_data() + sdr, sir, sar = evaluator.calculate_metrics() + evaluator.print_metrics(sdr, sir, sar) +else: + logging.critical("Invalid action - %s", args.mode) + sys.exit(12) +logging.log(56, "Script finished!") diff --git a/misc/README.MD b/misc/README.MD new file mode 100644 index 0000000..cb1da1c --- /dev/null +++ b/misc/README.MD @@ -0,0 +1,24 @@ +Miscellaneous utilities +=== + +This folder contains miscellaneous utilities that aren't required for the CNN, but they might be helpful anyway. + +## `test_tensorflow.py` + +This script lists all the devices tensorflow detects. Useful to see whether tensorflow loads, and if it does - what devices it picks up. Having GPU support helps. + +## `create_spectrogram.py` + +This script generates a spectrogram of an audio file. For more information run - `python create_spectrogram.py -h`. + +## `create_binmask.py` + +Creates a binary mask from an audio file. + +## `apply_vocal_mask.py` + +This script accepts a mixture and a vocal track, processes them the same way it's done when preparing data for the CNN and outputs both vocal and instrumental tracks. Can be used for comparisons. + +## `cnn_output_plot.py` + +Makes a mask of network outputs. Sadly Matplotlib is kind of terrible for datasets of this type, so I don't recommend using it. 
Accepts a file containing a numpy array of prediction values generated by running the neural net in `separate` mode with the `dump_data` argument set to `true`. diff --git a/misc/apply_vocal_mask.py b/misc/apply_vocal_mask.py new file mode 100644 index 0000000..e71578c --- /dev/null +++ b/misc/apply_vocal_mask.py @@ -0,0 +1,63 @@ +# A tool to obtain instrumentals from a mixture using a vocal track +# This is used to create files that can be compared with CNNs +# output. Because of this it also generates and uses binary masks +# so that any problems caused by that method don't impact the comparison. +import numpy as np +import librosa +import sys +import argparse + +parser = argparse.ArgumentParser(description="A small tool to obtain instrumentals from a mixture provided a vocal track") +parser.add_argument("mixture", default="mixture.wav", type=str, help="Path to the mixture") +parser.add_argument("vocals", default="vocals.wav", type=str, help="Path to the vocal track") +args = parser.parse_args() + +print("Loading files...") +mixture, mixture_sample_rate = librosa.load(args.mixture) +vocals, vocal_sample_rate = librosa.load(args.vocals) + +if mixture_sample_rate != vocal_sample_rate: + print("Sample rates don't match. Resample your data and try again.") + sys.exit(2) + +print("Processing audio data...") +mixture_spectrogram = librosa.stft(mixture, 2048, 256) +vocal_spectrogram = librosa.stft(vocals, 2048, 256) + +vocal_amplitude = librosa.power_to_db(np.abs(vocal_spectrogram)**2) +mixture_magnitude, mixture_phase = librosa.core.magphase(mixture_spectrogram) + +# Create a binary mask for the vocals +# We can actually work with just the raw spectrograms if we need to, +# but that's not the point of this script +print("Generating masks...") +vocal_mask = [] +instrumental_mask = [] +for x in range(0, vocal_amplitude.shape[0]): + vocal_slice = [] + instrumental_slice = [] + for y in range(0, vocal_amplitude.shape[1]): + # This is the same algorithm as used in song.py + # Please refer to that file for more information on what it does + # And the problems this has. + if vocal_amplitude[x][y] > 0.01: + vocal_slice.append(1) + instrumental_slice.append(0) + else: + vocal_slice.append(0) + instrumental_slice.append(1) + vocal_mask.append(vocal_slice) + instrumental_mask.append(instrumental_slice) + +print("Applying masks...") +output_vocals = np.multiply(mixture_magnitude, vocal_mask) * mixture_phase +output_instrumentals = np.multiply(mixture_magnitude, instrumental_mask) * mixture_phase + +print("Processing audio data...") +vocal_data = librosa.istft(output_vocals, 256, 2048) +instrumental_data = librosa.istft(output_instrumentals, 256, 2048) + +print("Outputting files...") +librosa.output.write_wav("processed_vocals.wav", vocal_data, mixture_sample_rate, norm=True) +librosa.output.write_wav("processed_instrumentals.wav", instrumental_data, mixture_sample_rate, norm=True) +print("Completed. Output can be found in processed_vocals.wav and processed_instrumentals.wav.") diff --git a/misc/cnn_output_plot.py b/misc/cnn_output_plot.py new file mode 100644 index 0000000..59510d8 --- /dev/null +++ b/misc/cnn_output_plot.py @@ -0,0 +1,52 @@ +# A script to generate a binary mask's visual representation +import matplotlib.pyplot as plt +import numpy as np +import argparse +import sys + +parser = argparse.ArgumentParser(description="Create a visualisation for NN output") +parser.add_argument("file", type=str, help="Path to the outputs (saved numpy array, i.e. 
labels.out)") +parser.add_argument("type", type=str, default="binary", nargs='?', help="Type of graph to output (normal/binary)") +parser.add_argument("output", default="", nargs='?', type=str, help="Path to output file (png)") +args = parser.parse_args() + +data = np.loadtxt(args.file) + +fig = plt.figure() +plt.title('Binary mask') + +if args.type == "normal": + # import matplotlib as mpl + # mpl.use('Qt5Agg') #Requires pyqt5 + + x_range_start = 0 + x_range_end = 50 + y_range_start = 845 + y_range_end = 865 + + newdata = data[x_range_start:x_range_end, y_range_start:y_range_end] + plt.imshow(newdata, cmap="Greys", origin='lower') + + ax = plt.gca() + ax.set_xlabel('Time') + ax.set_ylabel('Frequency') + for i in range(0, np.shape(newdata)[0]): + for j in range(0, np.shape(newdata)[1]): + s = '%.2f' % newdata[i][j] + ax.text(j, i, s, fontsize=5, ha='center', va='center') +elif args.type == "binary": + processed = np.empty(np.shape(data)) + processed[data>0.45] = [1] + processed[data<0.45] = [0] + plt.imshow(processed, interpolation='nearest', cmap="Greys", origin='lower') +else: + print("Invalid action - ", args.type) + sys.exit(2) + +if args.output is not "": + plt.savefig(args.output, dpi=1000) + print("Saved to ", args.output) +else: + #plt.rcParams['figure.figsize'] = (1000, 1000) + plt.tight_layout() + plt.show() diff --git a/misc/create_binmask.py b/misc/create_binmask.py new file mode 100644 index 0000000..5908cc6 --- /dev/null +++ b/misc/create_binmask.py @@ -0,0 +1,40 @@ +# A script to generate a binary mask of an audio file using librosa and matplotlib +import matplotlib.pyplot as plt +import numpy as np +import librosa +import librosa.display +import argparse + +parser = argparse.ArgumentParser(description="Create spectrogram from a wav file") +parser.add_argument("file", type=str, help="Path to the file (wav)") +parser.add_argument("mel", type=str, nargs='?', default="false", help="Should this spectrogram be rendered as a mel spectrogram? 
(true/false)") +parser.add_argument("output", default="", nargs='?', type=str, help="Path to output file (png)") +args = parser.parse_args() +audio, sampleRate = librosa.load(args.file) +plt.figure(figsize=(10,4)) + +mel = True if args.mel.lower() in ("yes", "true", "y", "t", "1") else False +amplitude = None +if mel is True: + amplitude = librosa.power_to_db(np.abs(librosa.feature.melspectrogram(audio, sr=sampleRate))) +else: + amplitude = librosa.power_to_db(np.abs(librosa.stft(audio))) + +slices = [] +for x in range(0, amplitude.shape[0]): + _slice = [] + for y in range(0, amplitude.shape[1]): + if amplitude[x][y] > 0.01: + _slice.append(1) + else: + _slice.append(0) + slices.append(_slice) + +plt.imshow(slices, interpolation='nearest', cmap="Greys", origin='lower') +#librosa.display.specshow(slices, y_axis='mel', fmax=sampleRate, x_axis='time') +plt.title('Binary mask') +plt.tight_layout() +if args.output is not "": + plt.savefig(args.output) +else: + plt.show() diff --git a/misc/create_spectrogram.py b/misc/create_spectrogram.py new file mode 100644 index 0000000..10dfc3e --- /dev/null +++ b/misc/create_spectrogram.py @@ -0,0 +1,29 @@ +# A script to generate a spectrogram of an audio file using librosa and matplotlib +import matplotlib.pyplot as plt +import numpy as np +import librosa +import librosa.display +import argparse + +parser = argparse.ArgumentParser(description="Create spectrogram from a wav file") +parser.add_argument("file", type=str, help="Path to the file (wav)") +parser.add_argument("mel", type=str, nargs='?', default="false", help="Should this spectrogram be rendered as a mel spectrogram? (true/false)") +parser.add_argument("output", default="", nargs='?', type=str, help="Path to output file (png)") +args = parser.parse_args() +audio, sampleRate = librosa.load(args.file) +plt.figure(figsize=(10,4)) + +mel = True if args.mel.lower() in ("yes", "true", "y", "t", "1") else False +spectrogram = None +if mel is True: + spectrogram = librosa.feature.melspectrogram(audio, sr=sampleRate) +else: + spectrogram = librosa.stft(audio) +librosa.display.specshow(librosa.power_to_db(np.abs(spectrogram), ref=np.max), y_axis='mel', fmax=sampleRate, x_axis='time') +plt.colorbar(format='%+2.0f dB') +plt.title('Spectrogram') +plt.tight_layout() +if args.output is not "": + plt.savefig(args.output) +else: + plt.show() diff --git a/misc/test_tensorflow.py b/misc/test_tensorflow.py new file mode 100644 index 0000000..32b3b07 --- /dev/null +++ b/misc/test_tensorflow.py @@ -0,0 +1,3 @@ +# Test whether tensorflow has picked up our GPU. +from tensorflow.python.client import device_lib +print(device_lib.list_local_devices()) diff --git a/model.py b/model.py new file mode 100644 index 0000000..59d56d7 --- /dev/null +++ b/model.py @@ -0,0 +1,107 @@ +import os +import sys +import numpy as np +import math +import keras +from keras.models import Sequential +from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D, Activation +from dataset import Dataset + +# Class to manage the model, it's state. 
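+# The architecture assembled in build() is a small stack of Conv2D/MaxPooling2D blocks followed by
+# dense layers, ending in one sigmoid unit per frequency bin (a per-bin probability of vocal activity).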
+class Model: + def __init__(self, logger, config, dataset=None, validation_data=None): + self.logger = logger + self.config = config + self.model = None + self.dataset = dataset + self.validation_data = validation_data + + # Build a model + # If you want to experiment, this is the place you want to make the changes in + def build(self, output_summary=False): + self.logger.info("Building the model...") + model = Sequential() + + bins = math.ceil(self.config.getint("song", "window_size")/2)+1 + model.add(Conv2D(32, (3,3), padding="same", input_shape=(bins, self.config.getint("song", "sample_length"), 1), activation='relu')) + model.add(Conv2D(32, (3,3), padding="same", activation='relu')) + model.add(MaxPooling2D(pool_size=(2,2))) + model.add(Dropout(0.25)) + model.add(Conv2D(64, (3,3), padding="same", activation='relu')) + model.add(Conv2D(64, (3,3), padding="same", activation='relu')) + model.add(MaxPooling2D(pool_size=(2,2))) + + model.add(Dropout(0.5)) + model.add(Flatten()) + model.add(Dense(512)) + model.add(Activation('elu')) + model.add(Dropout(0.5)) + model.add(Dense(bins, activation='sigmoid')) + model.compile(loss='binary_crossentropy', optimizer='adadelta', metrics=['accuracy']) + if output_summary is True: + model.summary() + self.model = model + + def train(self, epochs, batch=32, save_log=False, log_name="history.csv"): + if self.model is not None: + self.logger.info("Training the model...") + self.logger.info("Beggining training with %i samples.", len(self.dataset.mixture_windows)) + weights_backup = keras.callbacks.ModelCheckpoint('weights{epoch:08d}.h5', save_weights_only=True, period=5) + #TODO: Evaluate the use of fit_generator or train_on_batch to load data from disk instead of storing it all in RAM since it takes up a lot of memory otherwise. 
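+ # (One possible route, not implemented here: wrap the windowed spectrograms in a
+ # keras.utils.Sequence that loads precomputed batches from disk and pass it to
+ # fit_generator() instead of keeping everything in memory.)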
+ training = self.model.fit(self.dataset.mixture_windows, self.dataset.labels, batch_size=batch, epochs=epochs, validation_data=(self.validation_data.mixture_windows, self.validation_data.labels), callbacks=[weights_backup]) + self.logger.info("Training finished.") + if save_log is True: + self.logger.info("Exporting statistics.") + import csv + with open(log_name, 'w') as f: + w = csv.DictWriter(f, training.history.keys()) + w.writeheader() + w.writerow(training.history) + + def save(self, filename='obj.save'): + if self.model is not None: + self.model.save_weights(filename, overwrite=True) + else: + self.logger.critical("Cannot save weights - model not set up") + sys.exit(5) + + def load(self, filename='obj.save'): + if self.model is not None and os.path.isfile(filename): + self.model.load_weights(filename) + else: + self.logger.critical("Cannot load weights - model not set up or file not found") + sys.exit(3) + + def isolate(self, mixture, output="output.wav", save_accompaniment=True, save_original_mask=False, save_original_probabilities=False): + if self.model is not None: + #TODO: For some reason the output loses ~0.004s (3*BINS+1) worth of samples + self.logger.info("Preparing the song...") + split_x = np.array(mixture.split_slidingwindow(self.config.getint("song", "sample_length"))) + split_x = split_x.reshape(len(split_x), len(split_x[0]), len(split_x[0][0]), 1) + self.logger.info("Extracting vocals from the audio file...") + prediction = self.model.predict(split_x) + prediction = np.transpose(prediction) # Transpose the mask into the format librosa uses + if save_original_probabilities is True: + np.savetxt('original_predicted_probabilities.out', prediction) + self.logger.info("Calculating the binary mask...") + # Probability to label conversion, as there's no other way to get the output from the network in the right format + #TODO: don't fill the accompaniment array if save_accompaniment is set to False + accompaniment = np.zeros(np.shape(prediction)) + for x in range(0, len(prediction)): + for y in range(0, len(prediction[x])): + prediction[x][y] = 1 if prediction[x][y] > 0.45 else 0 # Higher values tend to make voice unintelligible. 
+ accompaniment[x][y] = 0 if prediction[x][y] > 0.45 else 1 + if save_original_mask is True: + np.savetxt('predicted_mask.out', prediction) + if save_accompaniment is True: + spectrogram_bak = mixture.get_spectrogram() + mixture.apply_binary_mask(accompaniment) + mixture.reverse_stft() + mixture.save_file("instrumental_"+output) + mixture.set_spectrogram(spectrogram_bak) + mixture.apply_binary_mask(prediction) + mixture.reverse_stft() + mixture.save_file(output) + else: + self.logger.critical("Model not set up, cannot attempt to isolate.") + sys.exit(4) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..203642f --- /dev/null +++ b/requirements.txt @@ -0,0 +1,62 @@ +absl-py==0.7.1 +asn1crypto==0.24.0 +astor==0.7.1 +attrs==19.1.0 +audioread==2.1.6 +bcrypt==3.1.6 +cairocffi==1.0.2 +cffi==1.12.2 +cryptography==2.6.1 +cycler==0.10.0 +decorator==4.4.0 +gast==0.2.2 +grpcio==1.19.0 +h5py==2.9.0 +Jinja2==2.10.1 +joblib==0.13.2 +jsonschema==3.0.1 +Keras==2.2.4 +Keras-Applications==1.0.7 +Keras-Preprocessing==1.0.9 +kiwisolver==1.0.1 +librosa==0.6.3 +llvmlite==0.28.0 +Markdown==3.1 +MarkupSafe==1.1.1 +matplotlib==3.0.3 +mock==2.0.0 +musdb==0.2.3 +museval==0.2.0 +numba==0.43.1 +numpy==1.16.2 +pandas==0.24.2 +paramiko==2.4.2 +pbr==5.1.3 +protobuf==3.7.1 +pyaml==19.4.1 +pyasn1==0.4.5 +pycairo==1.18.0 +pycparser==2.19 +PyNaCl==1.3.0 +pyparsing==2.4.0 +PyQt5==5.12.2 +PyQt5-sip==4.19.17 +pyrsistent==0.15.1 +python-dateutil==2.8.0 +pytz==2019.1 +PyYAML==5.1 +resampy==0.2.1 +scikit-learn==0.20.3 +scipy==1.2.1 +seaborn==0.9.0 +simplejson==3.16.0 +six==1.12.0 +SoundFile==0.10.2 +stempeg==0.1.6 +tensorboard==1.13.1 +tensorflow-estimator==1.13.0 +tensorflow-gpu==1.13.1 +termcolor==1.1.0 +tqdm==4.31.1 +virtualenv==16.4.3 +Werkzeug==0.15.2 diff --git a/song.py b/song.py new file mode 100644 index 0000000..21a2402 --- /dev/null +++ b/song.py @@ -0,0 +1,119 @@ +import librosa +import numpy as np +import math +import os + +# Song: Holds an information about a particular sound file and functions for modifying the raw sound data +# TODO: To decrease memory usage don't maintain all this data in RAM. Simply generate stfts, export them to a file and reuse them in the future. +class Song: + def __init__(self, logger, name, config): + self.logger=logger + self.config=config + self.name=name + self.type=None + self.amplitude=None + self.spectrogram=None + + # Load a file and resample it if necessary. This is a good idea if the data is inconsistent or has more samples than we need (22kHz is more than enough) + # Resampling is really slow though. You're better off resampling your audio manually beforehand. 
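+ # (For example, with ffmpeg: `ffmpeg -i input.wav -ar 22050 -ac 1 output.wav` - just a suggestion, not part of the pipeline.)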
+ def load_file(self, filename): + self.type=os.path.splitext(os.path.basename(filename))[0] + self.logger.debug("Loading file %s of type %s", filename, self.type) + self.data, _ = librosa.load(filename, sr=self.config.getint("song", "sample_size"), mono=True) + self.logger.debug("File loaded.") + + def save_file(self, filename): + #TODO: Don't save as a 32bit float since librosa can't load it afterwards + self.logger.info("Saving audio data to %s", filename) + librosa.output.write_wav(filename, self.data, self.config.getint("song", "sample_size"), norm=True) + + def dump_amplitude(self, note=""): + if self.amplitude is not None: + np.savetxt(self.name + '-' + self.type + '-' + note + '-amplitude.out', self.amplitude) + + def dump_spectrogram(self, note=""): + if self.spectrogram is not None: + np.savetxt(self.name + '-' + self.type + '-' + note + '-spectrogram.out', self.spectrogram) + + def get_spectrogram(self): + return self.spectrogram + def set_spectrogram(self, spectrogram): + self.spectrogram = spectrogram + + def get_name(self): + return self.name + def get_raw_data(self): + return self.data + + # Computes the short-term fourier transform and generates amplitude of the signals that the network can train on + def compute_stft(self, keep_spectrogram=False, keep_data=False): + if self.data is not None: + self.logger.debug("Generating sftf for %s", self.name) + spectrogram = librosa.stft(self.data, self.config.getint("song", "window_size"), hop_length=self.config.getint("song", "hop_length")) + self.amplitude = librosa.power_to_db(np.abs(spectrogram)**2) + if keep_spectrogram is True: + self.spectrogram = spectrogram + self.data = None + else: + self.logger.critical("Tried to generate stft for %s when the file wasn't loaded.", self.name) + sys.exit(6) + + # Split the spectrogram into smaller blocks so that our network can work with it + def split_spectrogram(self, length=25): + # Each frequency bin (defined by window size) contains data for that particular + # frequency over time - [frequency][time]. + # To make the data more paletable for the neural network, we need to split it. + # Each time entry corresponds to hop_size/sample_rate (i.e. 5ms @ 44100 Hz with hop size 256) + # We only need to predict the middle bin, the rest are there for context + # FIXME: This loses a few ms of data from the input audio since it rounds down. + # TODO: Save the spectrogram data to a file to avoid needing to generate and store it in RAM it every time + slices = [] + for x in range (0, self.amplitude.shape[1] // length): + _slice = self.amplitude[:,x * length : (x + 1) * length] + slices.append(_slice) + return slices + + def split_slidingwindow(self, length=25): + # Similar to the previous function but we create a sliding window for each + # bin. We do this only when predicting for a real song because of the memory requirements. + height = self.amplitude.shape[0] + # Pad the dataset + amplitude = self.amplitude + amplitude = np.column_stack((np.zeros((height, math.floor(length/2))), amplitude)) + amplitude = np.column_stack((amplitude, np.zeros((height, math.floor(length/2))))) + slices = [] + for x in range(math.floor(length/2), amplitude.shape[1] - math.floor(length/2)): + length_before = x - math.floor(length/2) + length_after = x + math.floor(length/2) + slices.append(np.array(amplitude[:, length_before : (length_after + 1)])) + return slices + + def get_labels(self, length=25): + # The labels contain the value of the middle slice of each time container + # in each frequency. 
The network understands which slice to target eventually. + # TODO: Maybe generate binary masks with librosa's softmask instead? + slices = [] + for x in range(0, self.amplitude.shape[1] // length): + _slice = [] + for y in range(0, self.amplitude.shape[0]): + # Mark whether there's voice acitivity in the given freq. in the given time bin + # The vocals might actually have some activity that equal silence but isn't a 0 in the original mix + # so we have to filter these out accordingly. + # NOTE: This _might_ clean out whispering and such. Test with your data set. + if self.amplitude[y,x*length+(math.ceil(length/2) if length > 1 else 0)] > 1: + _slice.append(1) + else: + _slice.append(0) + slices.append(_slice) + return slices + + # Apply network predictions and get useable output + def apply_binary_mask(self, mask): + self.spectrogram = np.multiply(self.spectrogram, mask) + + def reverse_stft(self): + if self.amplitude is not None: + self.data = librosa.istft(self.spectrogram, self.config.getint("song", "hop_length"), self.config.getint("song", "window_size")) + else: + self.logger.critical("Cannot find a STFT spectrogram to reverse - was it not generated?") + sys.exit(7) diff --git a/structure.md b/structure.md new file mode 100644 index 0000000..8b31bf2 --- /dev/null +++ b/structure.md @@ -0,0 +1,39 @@ +Training dataset: +``` +- dataset + - songname1 + - mixture.wav + - vocals.wav + - songname2 + - mixture.wav + - vocals.wav + ... + +- dataset-validation + - songname1 + - mixture.wav + - vocals.wav + - songname2 + - mixture.wav + - vocals.wav + ... +``` + +Separation: +You can specify any song with the `--file` option + +Evaluation: +``` +- evaluation + - songname1 + - vocals.wav + - accompaniment.wav + - estimated-vocals.wav + - estimated-accompaniment.wav + - songname2 + - vocals.wav + - accompaniment.wav + - estimated-vocals.wav + - estimated-accompaniment.wav + ... +```