-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added only necessary files and edited ignore
- Loading branch information
nlght
committed
May 9, 2019
1 parent
372ac0a
commit 0e81f17
Showing
27 changed files
with
754 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,5 @@ | ||
/data/ | ||
/.idea/ | ||
/__pycache__/ | ||
*.npy | ||
*.tar.gz |
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,199 @@ | ||
import numpy | ||
import scipy.io.wavfile | ||
import matplotlib.pyplot as plt | ||
#from scipy.fftpack import dct | ||
from mfcc_bro import do_mfcc | ||
|
||
# Number of samples every clip is extended to before framing.
_TARGET_NUM_SAMPLES = 16000


def _pad_by_repeating_head(samples, target_length, jump_threshold=200):
    """Extend *samples* to *target_length* by appending copies of its own start.

    On every pass the leading segment that is copied stops early at the first
    sample whose absolute value exceeds the previous sample's absolute value by
    more than *jump_threshold*, and it is never longer than the number of
    samples still missing, so the result never overshoots *target_length*.
    """
    if samples.shape[0] == 0:
        # Nothing to repeat; without this guard the loop below would never end.
        raise ValueError("cannot pad an empty signal")
    while samples.shape[0] < target_length:
        missing = target_length - samples.shape[0]
        head_length = 0
        previous = 0
        for value in samples[:missing]:
            if head_length > 0 and numpy.abs(value) - numpy.abs(previous) > jump_threshold:
                break
            previous = value
            head_length += 1
        samples = numpy.append(samples, samples[:head_length])
    return samples


def _frame_signal(samples, frame_length, frame_step):
    """Slice *samples* into overlapping frames of *frame_length* samples.

    The signal is zero-padded at the end so that every frame is complete.
    Returns a new 2-D array with one frame per row.
    """
    if frame_length <= 0 or frame_step <= 0:
        raise ValueError("window and stride must each span at least one sample")
    signal_length = len(samples)
    num_frames = int(numpy.ceil(float(numpy.abs(signal_length - frame_length)) / frame_step))
    padded_length = num_frames * frame_step + frame_length
    padded = numpy.append(samples, numpy.zeros(padded_length - signal_length))
    # Row r holds the sample indices r * frame_step ... r * frame_step + frame_length - 1.
    frame_starts = numpy.arange(0, num_frames * frame_step, frame_step)
    indices = numpy.arange(0, frame_length)[numpy.newaxis, :] + frame_starts[:, numpy.newaxis]
    return padded[indices.astype(numpy.int32, copy=False)]


def _mel_filterbank(sample_rate, n_fft, num_filters):
    """Build a (num_filters, n_fft // 2 + 1) bank of triangular mel filters."""
    low_freq_mel = 0
    # NOTE(review): the upper edge is sample_rate / 4, not the Nyquist frequency
    # sample_rate / 2 (the original author questioned this as well). It is kept
    # unchanged so the features stay compatible with already-trained models.
    high_freq_mel = 2595 * numpy.log10(1 + (sample_rate / 4) / 700)  # Hz -> mel
    mel_points = numpy.linspace(low_freq_mel, high_freq_mel, num_filters + 2)
    hz_points = 700 * (10 ** (mel_points / 2595) - 1)  # mel -> Hz
    fft_bins = numpy.floor((n_fft + 1) * hz_points / sample_rate)

    fbank = numpy.zeros((num_filters, int(numpy.floor(n_fft / 2 + 1))))
    for m in range(1, num_filters + 1):
        left = int(fft_bins[m - 1])
        center = int(fft_bins[m])
        right = int(fft_bins[m + 1])
        for k in range(left, center):  # rising slope
            fbank[m - 1, k] = (k - fft_bins[m - 1]) / (fft_bins[m] - fft_bins[m - 1])
        for k in range(center, right):  # falling slope
            fbank[m - 1, k] = (fft_bins[m + 1] - k) / (fft_bins[m + 1] - fft_bins[m])
    return fbank


def gimmeDaSPECtogram(input, window_size_ms=30.0, stride_ms=10.0, pre_emphasis=0.97, NFFT=512, triangular_filters=40, magnitude_squared=False, name=None):
    """Read a WAV file and return its liftered MFCC matrix.

    Pipeline: keep at most the first second of audio, apply pre-emphasis,
    extend the clip to 16000 samples by repeating its beginning, split it into
    Hamming-windowed frames, compute the power spectrum, apply a triangular
    mel filter bank, convert to a log scale and hand the result to ``do_mfcc``.

    Args:
        input: Path (or file object) accepted by ``scipy.io.wavfile.read``.
        window_size_ms: Frame length in milliseconds.
        stride_ms: Hop between consecutive frames in milliseconds.
        pre_emphasis: Coefficient of the first-order pre-emphasis filter.
        NFFT: FFT size used for every frame.
        triangular_filters: Number of mel filters.
        magnitude_squared: Unused; kept for interface compatibility.
        name: Unused; kept for interface compatibility.

    Returns:
        The transpose of the array returned by ``do_mfcc``.

    Raises:
        IndexError: If the file contains no samples.
    """
    sample_rate, signal = scipy.io.wavfile.read(input)
    signal = signal[0:int(1.0 * sample_rate)]  # keep at most the first second

    # The original code built a mean-valued padding array here and discarded
    # the result of numpy.append, so it had no effect (and raised for clips
    # longer than 16000 samples). The effective padding happens below, after
    # pre-emphasis.
    emphasized_signal = numpy.append(signal[0], signal[1:] - pre_emphasis * signal[:-1])
    if emphasized_signal.shape[0] > _TARGET_NUM_SAMPLES:
        emphasized_signal = emphasized_signal[0:int(1.0 * sample_rate)]
    elif emphasized_signal.shape[0] < _TARGET_NUM_SAMPLES:
        emphasized_signal = _pad_by_repeating_head(emphasized_signal, _TARGET_NUM_SAMPLES)

    # Milliseconds -> seconds -> samples.
    frame_length = int(round(window_size_ms / 1000 * sample_rate))
    frame_step = int(round(stride_ms / 1000 * sample_rate))

    frames = _frame_signal(emphasized_signal, frame_length, frame_step)
    frames *= numpy.hamming(frame_length)

    mag_frames = numpy.absolute(numpy.fft.rfft(frames, NFFT))  # magnitude of the FFT
    pow_frames = (1.0 / NFFT) * (mag_frames ** 2)  # power spectrum

    fbank = _mel_filterbank(sample_rate, NFFT, triangular_filters)
    filter_banks = numpy.dot(pow_frames, fbank.T)
    # Replace exact zeros so the logarithm below stays finite.
    filter_banks = numpy.where(filter_banks == 0, numpy.finfo(float).eps, filter_banks)
    # NOTE(review): a factor of 10 is the usual dB conversion for a power
    # spectrum; 20 is kept so the features match previously trained models.
    filter_banks = 20 * numpy.log10(filter_banks)

    mfcc = do_mfcc(filter_banks, upper_frequency_limit=4000, lower_frequency_limit=0, dct_coefficient_count=12)
    return mfcc.T
|
||
#plt.imshow(filter_banks) | ||
|
||
#plt.imshow(do_mfcc(filter_banks, upper_frequency_limit=4000, lower_frequency_limit=0, dct_coefficient_count=12)) | ||
|
||
#plt.show() | ||
|
||
#mfcc plot | ||
|
||
""" | ||
plt.subplot(312) | ||
filter_banks = do_mfcc(filter_banks, upper_frequency_limit=4000, lower_frequency_limit=0, dct_coefficient_count=12) | ||
#filter_banks -= (numpy.mean(filter_banks, axis=0) + 1e-8) | ||
plt.imshow(filter_banks.T, cmap=plt.cm.jet, aspect='auto') | ||
plt.xticks(numpy.arange(0, (filter_banks.T).shape[1], | ||
int((filter_banks.T).shape[1] / 4)), | ||
['0s', '0.25s', '0.5s', '0.75s', '1s']) | ||
plt.yticks(numpy.arange(1, (filter_banks.T).shape[0], | ||
int((filter_banks.T).shape[0]/4)), | ||
['0', '3', '6', '9', '12']) | ||
ax = plt.gca() | ||
ax.invert_yaxis() | ||
plt.title('the mfcc image') | ||
#Spectrum | ||
""" | ||
""" | ||
filter_banks -= (numpy.mean(filter_banks, axis=0) + 1e-8) | ||
plt.imshow(filter_banks.T, cmap=plt.cm.jet, aspect='auto') | ||
plt.xticks(numpy.arange(0, (filter_banks.T).shape[1], | ||
int((filter_banks.T).shape[1] / 4)), | ||
['0s', '0.25s', '0.5s', '0.75s', '1s']) | ||
ax = plt.gca() | ||
ax.invert_yaxis() | ||
plt.title('the spectrum image') | ||
""" | ||
|
||
plt.show() | ||
|
||
|
||
|
||
|
||
#gimmeDaSPECtogram("samples/left.wav", window_size_ms=30.0, stride_ms=10.0, pre_emphasis=0.97) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
from preprocess import * | ||
import keras | ||
from keras.models import Sequential | ||
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D | ||
from keras.utils import to_categorical | ||
from keras import backend as K | ||
import tensorflow as tf | ||
from SPECtogram import gimmeDaSPECtogram | ||
|
||
# ---------------------------------------------------------------------------
# Hyper-parameters and input geometry
# ---------------------------------------------------------------------------
feature_dim_1 = 97   # first feature dimension of every sample
feature_dim_2 = 12   # second feature dimension of every sample
channel = 1          # single input channel for the 2D convolutions
epochs = 50
batch_size = 100
verbose = 1

# ---------------------------------------------------------------------------
# Data set
# The helpers below are presumably provided by `from preprocess import *`.
# ---------------------------------------------------------------------------
# The data is saved to array files first, then loaded back as a split.
save_data_to_array(max_len=feature_dim_2)
X_train, X_test, y_train, y_test = get_train_test()

labels_local, _, _ = get_labels()
num_classes = len(labels_local)

# Add the trailing channel axis expected by Conv2D.
sample_shape = (feature_dim_1, feature_dim_2, channel)
X_train = X_train.reshape((X_train.shape[0],) + sample_shape)
X_test = X_test.reshape((X_test.shape[0],) + sample_shape)

# One-hot encode the targets for categorical cross-entropy.
y_train_hot = to_categorical(y_train)
y_test_hot = to_categorical(y_test)
|
||
|
||
|
||
|
||
def get_model():
    """Build and compile the convolutional classifier.

    The network stacks three 2x2 convolutions (32, 48 and 120 filters), one
    2x2 max-pooling layer and two dense layers (128 and 64 units) with dropout
    in between, followed by a softmax layer. It reads the module-level
    ``feature_dim_1``, ``feature_dim_2``, ``channel`` and ``num_classes``.

    Returns:
        A compiled Keras ``Sequential`` model using categorical cross-entropy,
        the Adadelta optimizer and the accuracy metric.
    """
    model = Sequential()
    model.add(Conv2D(32, kernel_size=(2, 2), activation='relu',
                     input_shape=(feature_dim_1, feature_dim_2, channel)))
    model.add(Conv2D(48, kernel_size=(2, 2), activation='relu'))
    model.add(Conv2D(120, kernel_size=(2, 2), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.25))
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.25))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.4))
    # One output unit per label instead of a hard-coded 10, so the layer always
    # matches the width of the one-hot encoded targets.
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss=keras.losses.categorical_crossentropy,
                  optimizer=keras.optimizers.Adadelta(),
                  metrics=['accuracy'])
    return model
|
||
def predict(filepath, model):
    """Return the label predicted by *model* for the audio file at *filepath*.

    The file is converted with ``wav2mfcc``, reshaped into a batch of one
    sample, and the label at the index of the highest model output is looked
    up in the first element returned by ``get_labels()``.
    """
    features = wav2mfcc(filepath, feature_dim_2)
    single_batch = features.reshape(1, feature_dim_1, feature_dim_2, channel)
    label_names = get_labels()[0]
    scores = model.predict(single_batch)
    return label_names[np.argmax(scores)]
|
||
|
||
# Train the network, validating against the held-out split after each epoch.
model = get_model()
model.fit(
    X_train,
    y_train_hot,
    batch_size=batch_size,
    epochs=epochs,
    verbose=verbose,
    validation_data=(X_test, y_test_hot),
)

# Store the architecture as JSON and the weights as HDF5.
model_json = model.to_json()
with open("model_4.json", "w") as architecture_file:
    architecture_file.write(model_json)
model.save_weights("model_4.h5")
print("Saved model to disk")
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
def freeze_session(session, keep_var_names=None, output_names=None, clear_devices=True):
    """
    Freezes the state of a session into a pruned computation graph.

    Creates a new computation graph where variable nodes are replaced by
    constants taking their current value in the session. The new graph will be
    pruned so subgraphs that are not necessary to compute the requested
    outputs are removed.

    The list passed as ``output_names`` is not modified.

    @param session The TensorFlow session to be frozen.
    @param keep_var_names A list of variable names that should not be frozen,
                          or None to freeze all the variables in the graph.
    @param output_names Names of the relevant graph outputs.
    @param clear_devices Remove the device directives from the graph for better portability.
    @return The frozen graph definition.
    """
    graph = session.graph
    with graph.as_default():
        variable_names = [v.op.name for v in tf.global_variables()]
        freeze_var_names = list(set(variable_names).difference(keep_var_names or []))
        # Build a fresh list: the original `+=` extended the caller's list in place.
        output_names = list(output_names or []) + variable_names
        input_graph_def = graph.as_graph_def()
        if clear_devices:
            for node in input_graph_def.node:
                node.device = ""
        frozen_graph = tf.graph_util.convert_variables_to_constants(
            session, input_graph_def, output_names, freeze_var_names)
        return frozen_graph
|
||
|
||
|
||
# Freeze the trained Keras graph and write it out as a binary protobuf.
keras_session = K.get_session()
model_output_names = [tensor.op.name for tensor in model.outputs]
frozen_graph = freeze_session(keras_session, output_names=model_output_names)

tf.train.write_graph(
    frozen_graph,
    "/home/night/PycharmProjects/APMiniProject/",
    "my_model_4.pb",
    as_text=False,
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
import numpy | ||
from scipy.fftpack import dct | ||
|
||
def do_mfcc(spectrogram, upper_frequency_limit=4000, lower_frequency_limit=0, dct_coefficient_count=12, cep_lifter=22):
    """Turn a log filter-bank spectrogram into liftered, mean-normalized MFCCs.

    Args:
        spectrogram: 2-D array with one frame per row and one filter-bank
            energy per column. It is not modified.
        upper_frequency_limit: Unused; kept for interface compatibility.
        lower_frequency_limit: Unused; kept for interface compatibility.
        dct_coefficient_count: Number of cepstral coefficients to keep. The
            zeroth DCT coefficient is always dropped, so coefficients
            1..dct_coefficient_count are returned.
        cep_lifter: Sinusoidal lifter parameter applied to the final cepstral
            coefficients. 0 disables liftering; the default is 22.

    Returns:
        Array with one row per frame and up to ``dct_coefficient_count``
        columns.
    """
    cepstra = dct(spectrogram, type=2, axis=1, norm='ortho')[:, 1:(dct_coefficient_count + 1)]

    # Per-coefficient mean normalization across all frames.
    cepstra = cepstra - (numpy.mean(cepstra, axis=0) + 1e-8)

    if cep_lifter > 0:
        n = numpy.arange(cepstra.shape[1])
        lift = 1 + (cep_lifter / 2) * numpy.sin(numpy.pi * n / cep_lifter)
        cepstra = cepstra * lift

    return cepstra
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
{"class_name": "Sequential", "config": {"name": "sequential_1", "layers": [{"class_name": "Conv2D", "config": {"name": "conv2d_1", "trainable": true, "batch_input_shape": [null, 97, 12, 1], "dtype": "float32", "filters": 32, "kernel_size": [2, 2], "strides": [1, 1], "padding": "valid", "data_format": "channels_last", "dilation_rate": [1, 1], "activation": "relu", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "Conv2D", "config": {"name": "conv2d_2", "trainable": true, "filters": 48, "kernel_size": [2, 2], "strides": [1, 1], "padding": "valid", "data_format": "channels_last", "dilation_rate": [1, 1], "activation": "relu", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "Conv2D", "config": {"name": "conv2d_3", "trainable": true, "filters": 120, "kernel_size": [2, 2], "strides": [1, 1], "padding": "valid", "data_format": "channels_last", "dilation_rate": [1, 1], "activation": "relu", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "MaxPooling2D", "config": {"name": "max_pooling2d_1", "trainable": true, 
"pool_size": [2, 2], "padding": "valid", "strides": [2, 2], "data_format": "channels_last"}}, {"class_name": "Dropout", "config": {"name": "dropout_1", "trainable": true, "rate": 0.25, "noise_shape": null, "seed": null}}, {"class_name": "Flatten", "config": {"name": "flatten_1", "trainable": true, "data_format": "channels_last"}}, {"class_name": "Dense", "config": {"name": "dense_1", "trainable": true, "units": 128, "activation": "relu", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "Dropout", "config": {"name": "dropout_2", "trainable": true, "rate": 0.25, "noise_shape": null, "seed": null}}, {"class_name": "Dense", "config": {"name": "dense_2", "trainable": true, "units": 64, "activation": "relu", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "Dropout", "config": {"name": "dropout_3", "trainable": true, "rate": 0.4, "noise_shape": null, "seed": null}}, {"class_name": "Dense", "config": {"name": "dense_3", "trainable": true, "units": 10, "activation": "softmax", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}]}, 
"keras_version": "2.2.4", "backend": "tensorflow"} |
Binary file not shown.
Oops, something went wrong.