Commit a1536602 authored by Alexander Fuchs

Added support for non-birdcall samples

parent 3404aa9e
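
In short, the training pipeline gains an extra "no bird" output class: precomputed spectrograms of non-birdcall audio (the new 'false_sample' entry) are swapped in for roughly one in n_classes training samples and labelled with that extra class, so the network also learns to reject background recordings. A minimal sketch of the idea, assuming the sample dict produced by the data generator below; the function name and the use of tf.cond are illustrative, not the commit's exact code:

    import tensorflow as tf

    def mix_in_false_sample(sample, n_classes, is_training):
        """Randomly swap in a non-birdcall spectrogram and label it as the extra class."""
        def take_false():
            feats = tf.image.per_image_standardization(sample['false_sample'])
            # The last index is reserved for the background ("no bird") class.
            return feats, tf.one_hot(n_classes, n_classes + 1, dtype=tf.int32)
        def take_real():
            feats = tf.image.per_image_standardization(sample['input_features'])
            return feats, tf.cast(sample['labels'], tf.int32)
        use_false = tf.logical_and(is_training,
                                   tf.random.uniform([]) < 1.0 / n_classes)
        features, labels = tf.cond(use_false, take_false, take_real)
        return {'input_features': features, 'labels': labels}
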
@@ -9,37 +9,47 @@ from utils.data_loader import Dataset
from utils.data_loader import DataGenerator
from models.network import Network
from models.res_block import ResBlockBasicLayer
#logging.set_verbosity(logging.WARNING)
def preprocess_input(sample):
def preprocess_input(sample,n_classes,is_training):
"""Preprocess a single image of layout [height, width, depth]."""
return sample
if tf.random.uniform([1])[0] > 1/n_classes or not(is_training):
input_features = tf.image.per_image_standardization(sample['input_features'])
labels = sample['labels']
else:
input_features = tf.image.per_image_standardization(sample['false_sample'])
labels = tf.cast(tf.one_hot(n_classes+1,n_classes+1),tf.int32)
return {'input_features':input_features,'labels':labels,'false_sample':sample['false_sample']}
def data_generator(data_generator,batch_size,is_training,
shuffle_buffer = 128,
is_validation=False,
n_classes = 10,
take_n=None,
skip_n=None):
dataset = tf.data.Dataset.from_generator(data_generator,
output_types = {'input_features':tf.float32,
'labels':tf.int32})
'labels':tf.int32,
'false_sample':tf.float32})
if skip_n != None:
dataset = dataset.skip(skip_n)
if take_n != None:
dataset = dataset.take(take_n)
if is_training:
dataset = dataset.shuffle(shuffle_buffer)
#dataset = dataset.map(preprocess_input)
dataset = dataset.map(lambda sample : preprocess_input(sample,n_classes,is_training))
dataset = dataset.batch(batch_size,drop_remainder=True)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
else:
#dataset = dataset.map(preprocess_input)
dataset = dataset.map(lambda sample : preprocess_input(sample,n_classes,is_training))
dataset = dataset.batch(batch_size)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
@@ -47,11 +57,11 @@ def data_generator(data_generator,batch_size,is_training,
def learning_rate_fn(epoch):
if epoch >= 20 and epoch <30:
if epoch >= 150 and epoch <200:
return 0.1
elif epoch >=200 and epoch <250:
return 0.01
elif epoch >=30 and epoch <40:
return 0.001
elif epoch >=40:
elif epoch >=250:
return 0.001
else:
return 1.0
@@ -60,12 +70,12 @@ def learning_rate_fn(epoch):
FLAGS = flags.FLAGS
flags.DEFINE_string('model_dir', '/tmp', 'save directory name')
flags.DEFINE_string('data_dir', '/tmp', 'data directory name')
flags.DEFINE_integer('epochs', 40, 'number of epochs')
flags.DEFINE_integer('batch_size', 16, 'Mini-batch size')
flags.DEFINE_integer('epochs', 300, 'number of epochs')
flags.DEFINE_integer('batch_size', 32, 'Mini-batch size')
flags.DEFINE_float('dropout_rate', 0.0, 'dropout rate for the dense blocks')
flags.DEFINE_float('weight_decay', 1e-4, 'weight decay parameter')
flags.DEFINE_float('learning_rate', 1e-3, 'learning rate')
flags.DEFINE_float('training_percentage', 80, 'Percentage of the training data used for training. (100-training_percentage is used as validation data.)')
flags.DEFINE_float('training_percentage', 90, 'Percentage of the training data used for training. (100-training_percentage is used as validation data.)')
flags.DEFINE_boolean('load_model', False, 'Bool indicating if the model should be loaded')
@@ -96,7 +106,7 @@ def main(argv):
load_model = FLAGS.load_model
training_percentage = FLAGS.training_percentage
model_save_dir+="_dropout_rate_"+str(dropout_rate)+"_learning_rate_"+str(lr)+"_weight_decay_"+str(weight_decay)
model_save_dir+="_batch_size_"+str(batch_size)+"_dropout_rate_"+str(dropout_rate)+"_learning_rate_"+str(lr)+"_weight_decay_"+str(weight_decay)
ds_train = Dataset(data_dir,is_training_set = True)
n_total = ds_train.n_samples
@@ -108,12 +118,12 @@ def main(argv):
model = Network(ResBlockBasicLayer,
n_blocks = 4,
n_layers = [2,2,2,2],
strides = [2,1,2,1],
strides = [2,2,2,2],
channel_base = [64,128,256,512],
n_classes = ds_train.n_classes,
n_classes = ds_train.n_classes+1,
init_ch = 64,
init_ksize = 7,
init_stride = 1,
init_stride = 2,
use_max_pool = True,
kernel_regularizer = tf.keras.regularizers.l2(2e-4),
kernel_initializer = tf.keras.initializers.he_normal(),
@@ -122,11 +132,13 @@ def main(argv):
train_data_gen = data_generator(dg_train.generate,batch_size,
is_training=True,
shuffle_buffer = 256,
n_classes = ds_train.n_classes,
take_n=n_train)
val_data_gen = data_generator(dg_train.generate,10,
is_training=False,
is_validation = True,
n_classes = ds_train.n_classes,
skip_n=n_train,
take_n=n_val)
@@ -142,7 +154,7 @@ def main(argv):
base_learning_rate = lr,
load_model = load_model,
save_dir = model_save_dir,
init_data = tf.random.normal([batch_size,1025,432,2]),
init_data = tf.random.normal([batch_size,1025,216,2]),
start_epoch = 0)
trainer.train()
@@ -8,6 +8,7 @@ import warnings
import multiprocessing
import sys
import random
import copy
import tensorflow as tf
warnings.filterwarnings('ignore')
@@ -23,7 +24,9 @@ class Dataset(object):
self.train_audio_path = os.path.join(path,"train_audio")
self.train_csv_path = os.path.join(path,"train.csv")
self.train_dict = self.csv_to_dict(self.train_csv_path)
#Path of "false" audio samples
self.false_audio_path = os.path.join(path,"false_audio")
#Path to test audio
self.test_audio_path = os.path.join(path,"example_test_audio")
self.test_csv_path = os.path.join(path,"test.csv")
self.test_dict = self.csv_to_dict(self.test_csv_path)
@@ -132,14 +135,23 @@ class DataGenerator(object):
shuffle=True,
is_training=True,
force_feature_recalc=False,
max_time = 10,
max_time = 5,
max_samples_per_audio = 6,
n_fft = 2048,
hop_length = 512,
sampling_rate = 22050):
self.dataset = dataset
#Shuffle files before loading since dataset is ordered by class
if shuffle:
random.shuffle(self.dataset.train_samples)
self.augmentation = augmentation
self.shuffle = shuffle
self.is_training = is_training
self.sampling_rate = sampling_rate
self.n_fft = n_fft
self.hop_length = hop_length
self.max_time = max_time
self.max_samples_per_audio = max_samples_per_audio
self.force_feature_recalc = force_feature_recalc
def do_stft(self,y,channels):
@@ -147,8 +159,8 @@ class DataGenerator(object):
#STFT for all channels
for channel in range(channels):
spectrum = np.abs(librosa.core.stft(y[channel,:],
n_fft = 2048,
hop_length = 512,
n_fft = self.n_fft,
hop_length = self.hop_length,
window = 'hann',
center = True))
spectrum = np.asarray(spectrum,dtype=np.float32)
@@ -156,6 +168,24 @@ class DataGenerator(object):
spectra = np.stack(spectra,axis=0)
return spectra
def pad_sample(self,spectrum,x_size=np.ceil(5*22050/512)):
diff = int(x_size) - spectrum.shape[-1]
if diff == 0:
return spectrum
if diff > spectrum.shape[-1]:
while spectrum.shape[-1] < x_size:
spectrum = np.concatenate([spectrum,spectrum],axis=-1)
spectrum = spectrum[:,:,:int(x_size)]
else:
#The first frame is often zero; skip it when repeating to avoid a jump
if diff+1 < spectrum.shape[-1]:
spectrum = np.concatenate([spectrum,spectrum[:,:,1:diff+1]],axis=-1)
else:
spectrum = np.concatenate([spectrum,spectrum[:,:,:diff]],axis=-1)
return spectrum
def create_feature(self,sample):
"""Creates the features by doing a STFT"""
try:
@@ -168,34 +198,31 @@ class DataGenerator(object):
mono = False
y, sr = librosa.core.load(filename,mono=mono,sr=self.sampling_rate)
if mono == 1:
y,_ = librosa.effects.trim(y)
if mono == True:
y = np.expand_dims(y,0)
duration = y.shape[-1]/self.sampling_rate
#If the sample is longer than two times the maximum audio length, the segments are split
if duration > 2*self.max_time:
y_begin = y[:,:int(self.sampling_rate*self.max_time/2)]
y_end = y[:,-int(self.sampling_rate*self.max_time/2):]
#Transform first and last part of audio
spectra_begin = self.do_stft(y_begin,channels)
spectra_end = self.do_stft(y_end,channels)
spectra = np.concatenate([spectra_begin,spectra_end],axis=-1)
n_samples = int(np.ceil(duration/self.max_time))
n_samples = min(n_samples,self.max_samples_per_audio)
spectra = {}
for i_sample in range(n_samples):
start = i_sample*int(self.sampling_rate*self.max_time)
end = (i_sample+1)*int(self.sampling_rate*self.max_time)
end = min(end,y.shape[-1])
y_sample = y[:,start:end]
if y_sample.shape[-1] == 1:
break
#Transform audio
spectrum = self.do_stft(y_sample,channels)
#Pad spectrum
spectrum = self.pad_sample(spectrum,
x_size=np.ceil(self.max_time*self.sampling_rate/self.hop_length))
spectra[str(i_sample)] = spectrum
if "mp3" in filename:
np.savez(filename.replace("mp3","npz"),spectra)
else:
#If sample is > max_time and < 2*max time take the beginning of the file
if duration >= self.max_time:
y = y[:,:int(self.sampling_rate*self.max_time)]
else:
#pad end with zeros
y_tmp = np.zeros((channels,int(self.sampling_rate*self.max_time)))
y_tmp[:,:len(y[0,:])] = y
y = y_tmp
spectra = self.do_stft(y,channels)
#Pad spectra to match the size of case duration > 2*max_time
spectra = np.concatenate([spectra,np.zeros((channels,spectra.shape[1],1))],axis=-1)
np.savez(filename.replace("mp3","npz"),spectra)
np.savez(filename.replace("wav","npz"),spectra)
except:
spectra = None
print(sample['filename']+" failed at feature extraction!")
@@ -211,12 +238,14 @@ class DataGenerator(object):
n = len(samples)
ct = 0
tot_size = 0
for sample in samples:
spectra = self.create_feature(sample)
tot_size += spectra.size * spectra.itemsize/1e6
if np.any(spectra) == None:
print(sample["filename"]+" failed!")
else:
print("Calculated "+str(ct/n*100)+"% of samples...")
ct += 1
print("Calculated "+str(ct/n*100)+"% of samples, using "+str(tot_size)+" MB of disc space...")
def create_all_features_multi_cpu(self):
@@ -225,10 +254,6 @@ class DataGenerator(object):
else:
all_samples = self.dataset.test_samples
mp3_filenames = glob.glob(self.dataset.train_audio_path + "/**/*",
recursive = True)
samples = []
for sample in all_samples:
filename = sample['filename']
@@ -243,7 +268,33 @@ class DataGenerator(object):
for i, _ in enumerate(pool.imap_unordered(self.create_feature, samples), 1):
sys.stderr.write('\rdone {0:%}'.format(max(0,i/n)))
def create_false_features_multi_cpu(self):
filenames_mono = glob.glob(self.dataset.false_audio_path+ "/mono/*.wav",
recursive = True)
filenames_stereo = glob.glob(self.dataset.false_audio_path+ "/stereo/*.wav",
recursive = True)
samples = []
for filename in filenames_mono:
if not(os.path.isfile(filename.replace("wav","npz"))) or self.force_feature_recalc:
samples.append({'filename':filename,'channels':'1 mono'})
for filename in filenames_stereo:
if not(os.path.isfile(filename.replace("wav","npz"))) or self.force_feature_recalc:
samples.append({'filename':filename,'channels':'2 stereo'})
print(str(len(filenames_mono)+len(filenames_stereo)-len(samples))+" feature samples already exist")
n = len(samples)
pool = multiprocessing.Pool(os.cpu_count())
for i, _ in enumerate(pool.imap_unordered(self.create_feature, samples), 1):
sys.stderr.write('\rdone {0:%}'.format(max(0,i/n)))
def generate(self):
@@ -251,31 +302,65 @@ class DataGenerator(object):
samples = self.dataset.train_samples
else:
samples = self.dataset.test_samples
#Shuffle files before loading since dataset is ordered by class
random.shuffle(samples)
#Get paths of false samples
false_samples_mono = glob.glob(self.dataset.false_audio_path+ "/mono/*.npz",
recursive = True)
false_samples_stereo = glob.glob(self.dataset.false_audio_path+ "/stereo/*.npz",
recursive = True)
false_samples = false_samples_mono + false_samples_stereo
stft_len = int(np.ceil(self.max_time*self.sampling_rate/self.hop_length))
for sample in samples:
filename = sample['filename']
#If feature was already created load from file
if os.path.isfile(filename.replace("mp3","npz")) and not(self.force_feature_recalc):
spectra_npz = np.load(filename.replace("mp3","npz"),allow_pickle=True)
spectra = spectra_npz.f.arr_0
else:
#Create features via STFT if no file exists
spectra = self.create_feature(sample)
#Check for None type
if np.any(spectra) == None:
try:
filename = sample['filename']
#If feature was already created load from file
if os.path.isfile(filename.replace("mp3","npz")) and not(self.force_feature_recalc):
spectra_npz = np.load(filename.replace("mp3","npz"),allow_pickle=True)
spec_keys = spectra_npz.f.arr_0.item().keys()
spec_keys = list(spec_keys)
rnd_key = spec_keys[np.random.randint(0,len(spec_keys))]
spectra = spectra_npz.f.arr_0.item()[rnd_key]
else:
#Create features via STFT if no file exists
spectra = self.create_feature(sample)
#Check for None type and shape
if np.any(spectra) == None or spectra.shape[-1] != stft_len:
continue
#Get false sample
rnd_false_sample = random.choice(false_samples)
false_spectra_npz = np.load(rnd_false_sample,allow_pickle=True)
false_spec_keys = false_spectra_npz.f.arr_0.item().keys()
false_spec_keys = list(false_spec_keys)
false_rnd_key = false_spec_keys[np.random.randint(0,len(false_spec_keys))]
false_spectra = false_spectra_npz.f.arr_0.item()[false_rnd_key]
#If only mono --> duplicate
if spectra.shape[0] == 1:
spectra = np.tile(spectra,[2,1,1])
#If false only mono --> duplicate
if false_spectra.shape[0] == 1:
false_spectra = np.tile(false_spectra,[2,1,1])
#Transpose spectrograms for "channels_last"
spectra = tf.transpose(spectra,perm=[1,2,0])
false_spectra = tf.transpose(false_spectra,perm=[1,2,0])
yield {'input_features':spectra,
'labels':tf.one_hot(sample['bird_id'],self.dataset.n_classes+1),
'false_sample':false_spectra}
except:
continue
#If only mono --> duplicate
if spectra.shape[0] == 1:
spectra = np.tile(spectra,[2,1,1])
#Transpose spectrograms for "channels_last"
spectra = tf.transpose(spectra,perm=[1,2,0])
yield {'input_features':spectra,'labels':tf.one_hot(sample['bird_id'],self.dataset.n_classes)}
if __name__ == "__main__":
ds = Dataset("/srv/TUG/datasets/birdsong-recognition")
dg = DataGenerator(ds,None)
ds = Dataset("/srv/TUG/datasets/cornell_birdcall_recognition")
dg = DataGenerator(ds,None,force_feature_recalc=True)
dg.create_all_features_multi_cpu()
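
For reference, the new fixed input size follows directly from the shorter clips: with max_time = 5 s, sampling_rate = 22050 Hz and hop_length = 512, each clip gives ceil(5 * 22050 / 512) = 216 STFT frames, and n_fft = 2048 gives 2048 / 2 + 1 = 1025 frequency bins, which is why init_data in the training script changes to [batch_size, 1025, 216, 2]. A quick sanity check:

    import numpy as np

    n_fft, hop_length = 2048, 512
    max_time, sampling_rate = 5, 22050
    frames = int(np.ceil(max_time * sampling_rate / hop_length))  # 216 time frames
    bins = n_fft // 2 + 1                                         # 1025 frequency bins
    print(bins, frames)  # -> 1025 216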