-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpdf_to_audio.py
175 lines (131 loc) · 6.34 KB
/
pdf_to_audio.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
# Importing the required libraries
import fitz
import json
import re
from tkinter import Tk
from tkinter.filedialog import askopenfilename
import copy
import os
from google.cloud import texttospeech
from tqdm import tqdm
class pdf_to_audio_converter:
    """Convert the text of a PDF into one or more MP3 files.

    The text is extracted with PyMuPDF (``fitz``) and synthesized with the
    Google Cloud Text-to-Speech client. Long texts are cut into fixed-size
    pieces, each sent as its own request and written to its own MP3 file.
    """

    # Number of characters sent per synthesis request. The original author
    # chose 4800 to stay under a 5000-per-request service limit.
    # NOTE(review): that limit is presumably counted in bytes, so text with
    # many non-ASCII characters may need a smaller value -- confirm.
    CHUNK_SIZE = 4800

    # Service-account key used when GOOGLE_APPLICATION_CREDENTIALS is not
    # already set in the environment.
    DEFAULT_CREDENTIALS = "/Users/hiteshsaaimanancherypanneerselvam/Documents/Development_Projects/pdf_to_audiobook/My First Project-1f1b9b4bc545.json"

    @staticmethod
    def _page_text(page_dict):
        """Return the readable text of one parsed page.

        Args:
            page_dict: The decoded JSON of a page: a dict with a ``'blocks'``
                list, whose entries hold ``'lines'``, whose entries hold
                ``'spans'`` with a ``'text'`` string.

        Returns:
            A string in which every line is introduced by a newline and
            followed by the text of its spans. Spans that consist only of
            digits (and spaces) are dropped.
        """
        pieces = []
        # Blocks that carry no 'lines' key are skipped instead of raising
        # KeyError.
        for block in page_dict['blocks']:
            for line in block.get('lines', ()):
                pieces.append('\n')
                for span in line['spans']:
                    sentence = span['text']
                    # Digit-only spans are left out of the narration.
                    # presumably these are page numbers -- verify.
                    if sentence.replace(" ", "").isdigit():
                        continue
                    pieces.append(sentence)
        return ''.join(pieces)

    @staticmethod
    def _split_text(text, size=4800):
        """Cut ``text`` into consecutive pieces of at most ``size`` characters.

        Args:
            text: The string to split.
            size: Maximum length of each piece; must be positive.

        Returns:
            A list of strings whose concatenation equals ``text``. An empty
            ``text`` yields an empty list, and a length that is an exact
            multiple of ``size`` yields no trailing empty piece.

        Raises:
            ValueError: If ``size`` is not positive.
        """
        if size <= 0:
            raise ValueError("size must be a positive integer")
        return [text[start:start + size] for start in range(0, len(text), size)]

    # Converting the pdf to text
    def pdfToText(self, filename):
        """Extract the text of every page of the PDF at ``filename``.

        Args:
            filename: Path of the PDF file to open.

        Returns:
            The concatenated text of all pages, formatted as described in
            ``_page_text``.
        """
        doc = fitz.open(filename)
        try:
            page_texts = []
            for page in doc:
                # NOTE(review): newer PyMuPDF releases appear to have renamed
                # these camelCase methods to snake_case -- confirm against
                # the installed version before upgrading.
                displayList = page.getDisplayList()
                textPage = displayList.getTextPage()
                page_texts.append(self._page_text(json.loads(textPage.extractJSON())))
            # Joining once avoids repeatedly re-copying a growing string.
            return ''.join(page_texts)
        finally:
            # Release the document even when extraction fails.
            doc.close()

    def text_to_audio(self, filelocation=None):
        """Synthesize a PDF into MP3 audio and write it to disk.

        Args:
            filelocation: Path of the PDF to convert. When omitted, a file
                selection dialog is shown.

        Returns:
            The list of MP3 paths that were written, in order.

        Raises:
            ValueError: If no file was selected.

        Output goes to ``<name>_audio_folder`` relative to the current
        working directory, where ``<name>`` is the part of the file name
        before its first dot. Texts shorter than ``CHUNK_SIZE`` produce a
        single ``<name>_audio.mp3``; longer texts produce ``part_1.mp3``,
        ``part_2.mp3``, ...
        """
        if filelocation is None:
            # we don't want a full GUI, so keep the root window from appearing
            root = Tk()
            root.withdraw()
            try:
                # open the dialog GUI to select the PDF file
                filelocation = askopenfilename()
            finally:
                root.destroy()
        if not filelocation:
            # The dialog was cancelled; fail with a clear message instead of
            # handing an empty path to the PDF parser.
            raise ValueError("No PDF file was selected")

        # Converting the pdf to text(str)
        allText = self.pdfToText(filelocation)

        # Point the Text-to-Speech client at the service-account key, but
        # respect a value the caller has already configured.
        os.environ.setdefault("GOOGLE_APPLICATION_CREDENTIALS", self.DEFAULT_CREDENTIALS)

        file_name = filelocation.split('/')[-1].split('.')[0]
        folder_name = f'{file_name}_audio_folder'

        if len(allText) >= self.CHUNK_SIZE:
            # One request (and one output file) per piece. The pieces live in
            # a local list, so an exact multiple of CHUNK_SIZE no longer
            # leads to a request for a part that was never created.
            chunks = self._split_text(allText, self.CHUNK_SIZE)
            jobs = tqdm([
                (f'{folder_name}/part_{number}.mp3', chunk)
                for number, chunk in enumerate(chunks, start=1)
            ])
        else:
            jobs = [(f'{folder_name}/{file_name}_audio.mp3', allText)]

        # Instantiates a client
        client = texttospeech.TextToSpeechClient()
        # Build the voice request, select the language code ("en-US") and the
        # ssml voice gender ("neutral"). Identical for every request, so it
        # is built once.
        voice = texttospeech.VoiceSelectionParams(
            language_code="en-US", ssml_gender=texttospeech.SsmlVoiceGender.NEUTRAL
        )
        # Select the type of audio file you want returned
        audio_config = texttospeech.AudioConfig(
            audio_encoding=texttospeech.AudioEncoding.MP3
        )

        written = []
        for out_path, chunk in jobs:
            # Set the text input to be synthesized
            synthesis_input = texttospeech.SynthesisInput(text=chunk)
            # Perform the text-to-speech request on the text input with the
            # selected voice parameters and audio file type
            response = client.synthesize_speech(
                input=synthesis_input, voice=voice, audio_config=audio_config
            )
            # Created only after a successful response, so a failed first
            # request leaves no empty folder behind.
            os.makedirs(folder_name, exist_ok=True)
            # The response's audio_content is binary.
            with open(out_path, "wb") as out:
                # Write the response to the output file.
                out.write(response.audio_content)
            print(f'Audio content written to file {out_path}')
            written.append(out_path)
        return written
# Script entry point: run the conversion and report the outcome.
if __name__ == "__main__":
    try:
        pdf_to_audio_converter().text_to_audio()
    except Exception as e:
        # Top-level boundary: show the failure reason to the user.
        print(e)
    else:
        # Printed only when the conversion finished without an error.
        print("")
        print("Your Audio book is ready!!")
        print("Enjoy Listening")
        # The original printed the literal characters "/n"; a newline escape
        # was intended.
        print("\n")