-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
1 changed file
with
367 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,367 @@ | ||
|
||
import requests | ||
from bs4 import BeautifulSoup | ||
|
||
import csv | ||
import json | ||
|
||
|
||
class Analyzer(): | ||
""" | ||
This class returns a list of the operative part for a given celex id . Celex id is initialized through a constructor. | ||
""" | ||
celex: str # declare celex as a string | ||
|
||
def __init__(self, celex): # Initialize Celex id as a constructor , passed when calling the class | ||
self.celex = celex | ||
|
||
def html_page_structure_one(self) -> list: | ||
""" | ||
This function retreives operative part from documents of the respected celex id's . This function scrapes/parse the operative part from a nested | ||
table structure . The relevant text lies inside the coj-bold class of the span tag. | ||
""" | ||
website = requests.get( | ||
f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text | ||
parser = BeautifulSoup(website, 'lxml') | ||
div = parser.find_all('table') # Find all tables tag from the website | ||
one = [] | ||
for divs in div: | ||
# Find each nested table within the table | ||
table = divs.find('table') | ||
if table != None: | ||
# Find all p under the nested table with the coj-normal class | ||
p = table.find_all('p', class_="coj-normal") | ||
for x in p: | ||
# Span class of coj-bold under the p tag | ||
span = x.find_all('span', class_="coj-bold") | ||
for y in span: | ||
if x != None and y != None: | ||
|
||
# append text from span onto a list | ||
one.append(y.text) | ||
return one | ||
|
||
def html_page_structure_two(self) -> list: | ||
""" | ||
This function retreives operative part from documents of the respected celex id's . This function scrapes/parse the operative part from a paragraph | ||
(p) structure . The relevant text lies inside the normal class of the p tag which comes after the keyword operative of the previous span tag. | ||
""" | ||
website = requests.get( | ||
f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text | ||
parser = BeautifulSoup(website, 'lxml') | ||
p = parser.find_all('p') | ||
two = [] | ||
for para in p: | ||
|
||
span = para.find('span') | ||
if span != None: | ||
|
||
if "operative" in span.text.lower(): | ||
normal = span.find_all_next('p', class_="normal") | ||
for op in normal: | ||
|
||
two.append(op.text) | ||
return two | ||
|
||
def structure_three(self) -> list: | ||
""" | ||
This function retreives operative part from documents of the respected celex id's . This function scrapes/parse the operative part from a nested | ||
table structure . The relevant text lies inside the coj-bold class of the span tag. | ||
""" | ||
website = requests.get( | ||
f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text | ||
parser = BeautifulSoup(website, 'lxml') | ||
table = parser.find_all('table') | ||
three = [] | ||
for tables in table: | ||
interior = tables.find_all('table') | ||
for interiors in interior: | ||
if interiors != None: | ||
p = interiors.find_all('p', class_="coj-normal") | ||
for x in p: | ||
span = x.find_all('span', class_="coj-bold") | ||
for y in span: | ||
if x != None and y != None: | ||
|
||
three.append(y.text) | ||
return three | ||
|
||
def structure_four(self) -> list: | ||
""" | ||
This function retreives operative part from documents of the respected celex id's . This function scrapes/parse the operative part from a paragraph | ||
(p) structure . The relevant text lies inside the p tag which comes after the keyword operative of the previous span tag. | ||
""" | ||
website = requests.get( | ||
f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text | ||
parser = BeautifulSoup(website, 'lxml') | ||
p = parser.find_all('p') | ||
four = [] | ||
for para in p: | ||
|
||
span = para.find('span') | ||
if span != None: | ||
|
||
if "operative" in span.text.lower(): | ||
normal = span.find_all_next('table') | ||
for op in normal: | ||
tbody = op.find('tbody') | ||
new_p = tbody.find_all('p', class_="oj-normal") | ||
|
||
for subsequent in new_p: | ||
if subsequent != None: | ||
|
||
four.append(subsequent.text) | ||
|
||
return four | ||
|
||
def structure_five(self) -> list: | ||
""" | ||
This function retreives operative part from documents of the respected celex id's . This function scrapes/parse the operative part from a paragraph | ||
(p) structure . The relevant text lies inside the normal class of the p tag which comes after the keyword operative of the previous span tag. | ||
""" | ||
website = requests.get( | ||
f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text | ||
parser = BeautifulSoup(website, 'lxml') | ||
p = parser.find_all('p') | ||
five = [] | ||
for para in p: | ||
|
||
span = para.find('span') | ||
if span != None: | ||
|
||
if "operative" in span.text.lower(): | ||
normal = span.find_all_next('table') | ||
for op in normal: | ||
tbody = op.find('tbody') | ||
new_p = tbody.find_all('p', class_="normal") | ||
|
||
for subsequent in new_p: | ||
if subsequent != None: | ||
|
||
five.append(subsequent.text) | ||
|
||
return five | ||
|
||
def structure_six(self) -> list: | ||
""" | ||
This function retreives operative part from documents of the respected celex id's . This function scrapes/parse the operative part from a h2 | ||
(header) structure . The relevant text lies inside thee p tag which comes after the keyword operative part of the respective h2 tag. | ||
""" | ||
|
||
website = requests.get( | ||
f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text | ||
parser = BeautifulSoup(website, 'lxml') | ||
div = parser.find_all('h2') | ||
six = [] | ||
for h2 in div: | ||
# print(h2.text) | ||
if h2.text == "Operative part": | ||
operatives = h2.find_all_next('p') | ||
for operative in operatives: | ||
|
||
six.append(operative.text) | ||
return six | ||
|
||
def structure_seven(self) -> list: | ||
""" | ||
This function retreives operative part from documents of the respected celex id's . This function scrapes/parse the operative part from a table | ||
(table) structure . The relevant text lies inside the span tag which comes after the p tag , with the class name=normal. | ||
""" | ||
website = requests.get( | ||
f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text | ||
parser = BeautifulSoup(website, 'lxml') | ||
div = parser.find_all('table') | ||
seven = [] | ||
for divs in div: | ||
# find tbody within the table | ||
table = divs.find_all('tbody') | ||
for tables in table: | ||
if tables != None: | ||
# find tr within the tbody | ||
p = tables.find_all('tr') | ||
for x in p: | ||
if x != None: | ||
# find td within the tr | ||
td = x.find_all('td') | ||
for y in td: | ||
if y != None: | ||
p = y.find_all('p', class_="normal") | ||
for all in p: | ||
if all != None: | ||
# find operative part within the span | ||
span = all.find_all( | ||
'span', class_="bold") | ||
for spans in span: | ||
# APpend it into a list and return the list when the function is called | ||
seven.append(spans.text) | ||
return seven | ||
|
||
def structure_eight(self) -> list: | ||
""" | ||
This function retreives operative part from documents of the respected celex id's .The text is extracted from the span tag nested inside | ||
the tbody tag.Returns a list as output. | ||
""" | ||
website = requests.get( | ||
f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text | ||
parser = BeautifulSoup(website, 'lxml') | ||
|
||
tbody = parser.find_all('tbody') | ||
eight = [] | ||
for all in tbody: | ||
if all != None: | ||
tr = all.find_all('tr') | ||
for trs in tr: | ||
if trs != None: | ||
|
||
p = parser.find_all('p', class_="normal") | ||
for paras in p: | ||
if paras != None: | ||
if "on those grounds" in paras.text.lower(): | ||
|
||
span = paras.find_all_next( | ||
'span', class_="bold") | ||
for spans in span: | ||
if spans != None: | ||
eight.append(spans.text) | ||
|
||
return eight | ||
|
||
def structure_nine(self) -> list: | ||
""" | ||
This function retreives operative part from documents of the respected celex id's .The operative part is under the bold(b) | ||
tag after the p tag where the keywords "on those grounds" exist. | ||
""" | ||
website = requests.get( | ||
f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text | ||
parser = BeautifulSoup(website, 'lxml') | ||
nine = [] | ||
div = parser.find_all('p') | ||
for divs in div: | ||
if divs != None: | ||
if "on those grounds" in divs.text.lower(): | ||
b = divs.find_all_next('b') | ||
for bolds in b: | ||
# print(bolds.text) | ||
nine.append(bolds.text) | ||
return nine | ||
|
||
def structure_eleven(self) -> list: | ||
""" | ||
This function retreives operative part from documents of the respected celex id's .The operative part is under the paragraph(p) | ||
tag after the b tag where the keywords "operative part" exist. | ||
""" | ||
website = requests.get( | ||
f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text | ||
parser = BeautifulSoup(website, 'lxml') | ||
bold = parser.find_all('b') | ||
|
||
eleven = [] | ||
|
||
for b in bold: | ||
if b != None: | ||
if "operative part" in b.text.lower(): | ||
table = b.find_all_next('p') | ||
for tables in table: | ||
if tables != None: | ||
eleven.append(tables.text) | ||
|
||
return eleven | ||
|
||
def structure_ten(self): | ||
""" | ||
This function retreives operative part from documents of the respected celex id's Since the ocntent is preloaded using js/client s | ||
server side functions , the text from the current page is retrieved and the operative part is scraped after the occurence of the phrase | ||
"On those grounds". | ||
""" | ||
website = requests.get( | ||
f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text | ||
parser = BeautifulSoup(website, 'lxml') | ||
appender = [] | ||
for string in parser.stripped_strings: | ||
|
||
appender.append(string) | ||
|
||
found = False | ||
afterGrounds = [] | ||
|
||
for x in appender: | ||
|
||
if "on those grounds" in x.lower(): | ||
found = True | ||
|
||
if found: | ||
if len(x.split(" ")) > 3: | ||
afterGrounds.append(x) | ||
return afterGrounds | ||
|
||
def __call__(self) -> list: | ||
""" | ||
This inbuilt __call__ function loops through all the methods in the class `Analyzer` and returns the list , with values of the operative part . | ||
""" | ||
|
||
container = [self.html_page_structure_one(), self.html_page_structure_two(), self.structure_three(), self.structure_four(), self.structure_five(), | ||
self.structure_six(), self.structure_seven(), self.structure_eight(), self.structure_nine(), self.structure_ten(), self.structure_eleven()] | ||
|
||
one: list | ||
for funcs in range(len(container)): | ||
|
||
one = container[funcs] | ||
|
||
if one: | ||
if (len(one) != 0 or one[0] != "\n"): | ||
print("here") | ||
return one | ||
|
||
|
||
|
||
|
||
class Writing(): | ||
""" | ||
This class has different methods , for the purpose of writing the operative part into different file formats.(Csv,txt,json) | ||
""" | ||
|
||
instance: str | ||
x: str | ||
parameter: str | ||
|
||
def __init__(self, celex: str): | ||
self.celex = celex | ||
self.instance = Analyzer(self.celex) | ||
self.x = self.instance() | ||
|
||
def to_csv(self): | ||
file = open("csv/output.csv", "a+") | ||
writer = csv.writer(file) | ||
|
||
if self.x != None: | ||
writer.writerow([self.celex, self.x]) | ||
|
||
def to_json(self): | ||
if self.x != None: | ||
data = {'Celex': self.celex, "Operative part": self.x} | ||
file = open('json/data.json', 'a+') | ||
json.dump(data, file) | ||
file.close() | ||
|
||
def to_txt(self): | ||
|
||
if self.x != None: | ||
file = open(f"txt/{self.celex}.txt", "a") | ||
for w in self.x: | ||
|
||
file.write(w+"\n") | ||
file.close() | ||
# Sample code for reading celex id's froma tsv file | ||
|
||
|
||
file = open("gijs_202310_node_list.tsv", "r") | ||
reader = csv.reader(file) | ||
testing = [] | ||
for row in reader: | ||
for rows in row: | ||
if "Id" not in rows: | ||
testing.append(rows.split("\t")[0]) | ||
for all in testing: | ||
instance = Writing(all) | ||
instance.to_csv() | ||
print(all) |