encode_audio.py
import sys, os
sys.path.insert(1, os.path.join(sys.path[0], 'helpers'))

import numpy as np
import argparse

from tools import *

DATA_DIR = ''
N_CONTEXT = 60  # Total context length: when it is 60, 30 frames are seen before and 30 after the current frame
FEATURES = "MFCC+Pros"
N_INPUT = 30  # Total number of features per time-frame
SILENCE_PATH = "helpers/silence.wav"
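
# With the defaults above, each windowed sample spans N_CONTEXT + 1 = 61
# frames (30 past frames, the current frame, 30 future frames), each of
# dimension N_INPUT = 30; for "MFCC+Pros" that breaks down into 26 MFCC
# dimensions plus the 4 prosodic dimensions padded with zeros below.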
def pad_sequence(input_vectors):
    """
    Pad an array of features so that context can be taken at each time-frame.
    We pad with N_CONTEXT/2 frames before and after the signal, using the
    features of silence (and zeros for the prosodic features).

    Args:
        input_vectors: feature vectors for an audio file

    Returns:
        new_input_vectors: padded feature vectors
    """
    if FEATURES == "MFCC":
        # Pad the sequence not with zeros but with the MFCCs of silence
        silence_vectors = calculate_mfcc(SILENCE_PATH)
        mfcc_empty_vector = silence_vectors[0]
        empty_vectors = np.array([mfcc_empty_vector] * int(N_CONTEXT / 2))
    elif FEATURES == "Pros":
        # Pad the sequence with zeros
        prosodic_empty_vector = [0, 0, 0, 0]
        empty_vectors = np.array([prosodic_empty_vector] * int(N_CONTEXT / 2))
    elif FEATURES == "MFCC+Pros":
        silence_vectors = calculate_mfcc(SILENCE_PATH)
        mfcc_empty_vector = silence_vectors[0]
        prosodic_empty_vector = [0, 0, 0, 0]
        combined_empty_vector = np.concatenate((mfcc_empty_vector, prosodic_empty_vector))
        empty_vectors = np.array([combined_empty_vector] * int(N_CONTEXT / 2))
    elif FEATURES == "Spectro":
        silence_spectro = calculate_spectrogram(SILENCE_PATH)
        spectro_empty_vector = silence_spectro[0]
        empty_vectors = np.array([spectro_empty_vector] * int(N_CONTEXT / 2))
    elif FEATURES == "Spectro+Pros":
        silence_spectro = calculate_spectrogram(SILENCE_PATH)
        spectro_empty_vector = silence_spectro[0]
        prosodic_empty_vector = [0, 0, 0, 0]
        combined_empty_vector = np.concatenate((spectro_empty_vector, prosodic_empty_vector))
        empty_vectors = np.array([combined_empty_vector] * int(N_CONTEXT / 2))

    # Prepend N_CONTEXT/2 "empty" feature vectors (past context)
    new_input_vectors = np.append(empty_vectors, input_vectors, axis=0)
    # Append N_CONTEXT/2 "empty" feature vectors (future context)
    new_input_vectors = np.append(new_input_vectors, empty_vectors, axis=0)

    return new_input_vectors
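
# Illustrative shape check (the 1-second clip below is hypothetical, not part
# of the pipeline): padding adds N_CONTEXT frames in total, so a
# (100, N_INPUT) feature array becomes (100 + N_CONTEXT, N_INPUT):
#
#   feats = np.zeros((100, N_INPUT))
#   assert pad_sequence(feats).shape == (100 + N_CONTEXT, N_INPUT)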
def create_vectors(audio_filename):
    """
    Extract speech features with context from a given audio file.

    Args:
        audio_filename: file name for an audio file (.wav)

    Returns:
        input_with_context: speech features with their surrounding context
    """
    # Step 1: Vectorize speech with features of N_INPUT dimensions, a time step
    # of 0.01s and a window length of 0.025s => an array of 100 x N_INPUT per second
    if FEATURES == "MFCC":
        input_vectors = calculate_mfcc(audio_filename)
    elif FEATURES == "Pros":
        input_vectors = extract_prosodic_features(audio_filename)
    elif FEATURES == "MFCC+Pros":
        mfcc_vectors = calculate_mfcc(audio_filename)
        pros_vectors = extract_prosodic_features(audio_filename)
        mfcc_vectors, pros_vectors = shorten(mfcc_vectors, pros_vectors)
        input_vectors = np.concatenate((mfcc_vectors, pros_vectors), axis=1)
    elif FEATURES == "Spectro":
        input_vectors = calculate_spectrogram(audio_filename)
    elif FEATURES == "Spectro+Pros":
        spectr_vectors = calculate_spectrogram(audio_filename)
        pros_vectors = extract_prosodic_features(audio_filename)
        spectr_vectors, pros_vectors = shorten(spectr_vectors, pros_vectors)
        input_vectors = np.concatenate((spectr_vectors, pros_vectors), axis=1)

    # Step 2: Take a window of N_CONTEXT + 1 frames around each time-frame,
    # sliding one frame at a time
    strides = len(input_vectors)
    input_vectors = pad_sequence(input_vectors)

    windows = []
    for i in range(strides):
        center = i + int(N_CONTEXT / 2)
        window = input_vectors[center - int(N_CONTEXT / 2): center + int(N_CONTEXT / 2) + 1]
        windows.append(window.reshape(1, N_CONTEXT + 1, N_INPUT))
    input_with_context = np.concatenate(windows, axis=0)

    return input_with_context
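
# Illustrative: for a clip that yields T feature frames, create_vectors
# returns an array of shape (T, N_CONTEXT + 1, N_INPUT), i.e. one
# (61, 30)-sized context window per time-frame with the defaults above.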
def encode_and_save(audio_input_file, output_file):
    """Encode an audio file into windowed speech features and save them as .npy."""
    audio_encoded = create_vectors(audio_input_file)
    np.save(output_file, audio_encoded)
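
# Programmatic use (file names are illustrative):
#
#   encode_and_save('data/audio_segment.wav', 'data/audio_encoded.npy')
#   features = np.load('data/audio_encoded.npy')  # shape (T, N_CONTEXT + 1, N_INPUT)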
if __name__ == '__main__':
    # Parse command line params
    parser = argparse.ArgumentParser(
        description='Encode an audio file and save it in numpy format')
    parser.add_argument('--input_audio', '-i', default='data/audio_segment.wav',
                        help='Path to the input audio file')
    parser.add_argument('--output_file', '-o', default='data/audio_encoded.npy',
                        help='Path to the output file with the encoded audio')
    args = parser.parse_args()

    encode_and_save(args.input_audio, args.output_file)
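
# Example invocation (the paths are the script's defaults; the frame count
# T of the saved array depends on the length of the input clip):
#
#   python encode_audio.py -i data/audio_segment.wav -o data/audio_encoded.npy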