
Commit

Add files via upload
venvis authored Mar 21, 2024
1 parent 7cb83e4 commit d5e1aa3
Showing 1 changed file with 367 additions and 0 deletions.
cellar/cellar_extractor/operative_extractions.py: 367 additions & 0 deletions
@@ -0,0 +1,367 @@

import requests
from bs4 import BeautifulSoup

import csv
import json


class Analyzer():
"""
This class returns a list of the operative part for a given celex id . Celex id is initialized through a constructor.
"""
celex: str # declare celex as a string

def __init__(self, celex): # Initialize Celex id as a constructor , passed when calling the class
self.celex = celex

def html_page_structure_one(self) -> list:
"""
This function retreives operative part from documents of the respected celex id's . This function scrapes/parse the operative part from a nested
table structure . The relevant text lies inside the coj-bold class of the span tag.
"""
website = requests.get(
f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text
parser = BeautifulSoup(website, 'lxml')
        div = parser.find_all('table')  # Find all table tags on the page
        one = []
        for divs in div:
            # Look for a nested table within each table
            table = divs.find('table')
            if table is not None:
                # Find all p tags with the coj-normal class inside the nested table
                p = table.find_all('p', class_="coj-normal")
                for x in p:
                    # The spans with the coj-bold class under the p tag hold the operative text
                    span = x.find_all('span', class_="coj-bold")
                    for y in span:
                        # Append the span text to the output list
                        one.append(y.text)
        return one
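
    # Roughly the kind of markup html_page_structure_one targets (illustrative
    # only; the exact classes and nesting vary between EUR-Lex documents):
    #
    #   <table><tr><td><table>
    #     <p class="coj-normal"><span class="coj-bold">1. The operative text ...</span></p>
    #   </table></td></tr></table>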

def html_page_structure_two(self) -> list:
"""
This function retreives operative part from documents of the respected celex id's . This function scrapes/parse the operative part from a paragraph
(p) structure . The relevant text lies inside the normal class of the p tag which comes after the keyword operative of the previous span tag.
"""
website = requests.get(
f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text
parser = BeautifulSoup(website, 'lxml')
p = parser.find_all('p')
two = []
        for para in p:
            span = para.find('span')
            if span is not None and "operative" in span.text.lower():
                normal = span.find_all_next('p', class_="normal")
                for op in normal:
                    two.append(op.text)
        return two
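
    # The paragraph-based layout html_page_structure_two looks for is roughly
    # (illustrative only):
    #
    #   <p><span>Operative part</span></p>
    #   <p class="normal">1. The Court hereby rules ...</p>
    #   <p class="normal">2. Orders ... to pay the costs.</p>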

def structure_three(self) -> list:
"""
This function retreives operative part from documents of the respected celex id's . This function scrapes/parse the operative part from a nested
table structure . The relevant text lies inside the coj-bold class of the span tag.
"""
website = requests.get(
f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text
parser = BeautifulSoup(website, 'lxml')
table = parser.find_all('table')
three = []
        for tables in table:
            interior = tables.find_all('table')
            for interiors in interior:
                p = interiors.find_all('p', class_="coj-normal")
                for x in p:
                    span = x.find_all('span', class_="coj-bold")
                    for y in span:
                        three.append(y.text)
        return three

def structure_four(self) -> list:
"""
This function retreives operative part from documents of the respected celex id's . This function scrapes/parse the operative part from a paragraph
(p) structure . The relevant text lies inside the p tag which comes after the keyword operative of the previous span tag.
"""
website = requests.get(
f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text
parser = BeautifulSoup(website, 'lxml')
p = parser.find_all('p')
four = []
        for para in p:
            span = para.find('span')
            if span is not None and "operative" in span.text.lower():
                normal = span.find_all_next('table')
                for op in normal:
                    tbody = op.find('tbody')
                    if tbody is None:  # some tables have no tbody; skip them
                        continue
                    new_p = tbody.find_all('p', class_="oj-normal")
                    for subsequent in new_p:
                        four.append(subsequent.text)

        return four

def structure_five(self) -> list:
"""
This function retreives operative part from documents of the respected celex id's . This function scrapes/parse the operative part from a paragraph
(p) structure . The relevant text lies inside the normal class of the p tag which comes after the keyword operative of the previous span tag.
"""
website = requests.get(
f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text
parser = BeautifulSoup(website, 'lxml')
p = parser.find_all('p')
five = []
        for para in p:
            span = para.find('span')
            if span is not None and "operative" in span.text.lower():
                normal = span.find_all_next('table')
                for op in normal:
                    tbody = op.find('tbody')
                    if tbody is None:  # some tables have no tbody; skip them
                        continue
                    new_p = tbody.find_all('p', class_="normal")
                    for subsequent in new_p:
                        five.append(subsequent.text)

        return five

def structure_six(self) -> list:
"""
This function retreives operative part from documents of the respected celex id's . This function scrapes/parse the operative part from a h2
(header) structure . The relevant text lies inside thee p tag which comes after the keyword operative part of the respective h2 tag.
"""

website = requests.get(
f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text
parser = BeautifulSoup(website, 'lxml')
div = parser.find_all('h2')
six = []
        for h2 in div:
            if h2.text == "Operative part":
                operatives = h2.find_all_next('p')
                for operative in operatives:
                    six.append(operative.text)
        return six
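
    # Layout targeted by structure_six, roughly (illustrative only):
    #
    #   <h2>Operative part</h2>
    #   <p>1. The Court hereby rules ...</p>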

def structure_seven(self) -> list:
"""
This function retreives operative part from documents of the respected celex id's . This function scrapes/parse the operative part from a table
(table) structure . The relevant text lies inside the span tag which comes after the p tag , with the class name=normal.
"""
website = requests.get(
f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text
parser = BeautifulSoup(website, 'lxml')
div = parser.find_all('table')
seven = []
        for divs in div:
            # find each tbody within the table
            tbody = divs.find_all('tbody')
            for tables in tbody:
                # find the tr rows within the tbody
                rows = tables.find_all('tr')
                for x in rows:
                    # find the td cells within the tr
                    td = x.find_all('td')
                    for y in td:
                        paragraphs = y.find_all('p', class_="normal")
                        for paragraph in paragraphs:
                            # the operative part sits in the bold spans
                            span = paragraph.find_all('span', class_="bold")
                            for spans in span:
                                # Append it to the list returned by this method
                                seven.append(spans.text)
        return seven

def structure_eight(self) -> list:
"""
This function retreives operative part from documents of the respected celex id's .The text is extracted from the span tag nested inside
the tbody tag.Returns a list as output.
"""
website = requests.get(
f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text
parser = BeautifulSoup(website, 'lxml')

        eight = []
        # The bold spans that follow the "On those grounds" paragraph make up the operative part
        for paras in parser.find_all('p', class_="normal"):
            if "on those grounds" in paras.text.lower():
                span = paras.find_all_next('span', class_="bold")
                for spans in span:
                    eight.append(spans.text)

        return eight

def structure_nine(self) -> list:
"""
This function retreives operative part from documents of the respected celex id's .The operative part is under the bold(b)
tag after the p tag where the keywords "on those grounds" exist.
"""
website = requests.get(
f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text
parser = BeautifulSoup(website, 'lxml')
nine = []
div = parser.find_all('p')
        for divs in div:
            if "on those grounds" in divs.text.lower():
                b = divs.find_all_next('b')
                for bolds in b:
                    nine.append(bolds.text)
        return nine

def structure_eleven(self) -> list:
"""
This function retreives operative part from documents of the respected celex id's .The operative part is under the paragraph(p)
tag after the b tag where the keywords "operative part" exist.
"""
website = requests.get(
f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text
parser = BeautifulSoup(website, 'lxml')
bold = parser.find_all('b')

eleven = []

        for b in bold:
            if "operative part" in b.text.lower():
                paragraphs = b.find_all_next('p')
                for paragraph in paragraphs:
                    eleven.append(paragraph.text)

        return eleven

    def structure_ten(self) -> list:
        """
        Retrieves the operative part of the document for the given CELEX id.
        Since the content is pre-rendered by client- and server-side scripts,
        the plain text of the page is collected and the operative part is taken
        from the strings that appear after the phrase "On those grounds".
        """
website = requests.get(
f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text
parser = BeautifulSoup(website, 'lxml')
        appender = list(parser.stripped_strings)

        found = False
        after_grounds = []
        for x in appender:
            if "on those grounds" in x.lower():
                found = True
            # once the marker phrase has been seen, keep strings long enough to be sentences
            if found and len(x.split(" ")) > 3:
                after_grounds.append(x)
        return after_grounds
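
    # stripped_strings yields each visible text fragment on the page with the
    # surrounding whitespace removed, e.g. (illustrative only):
    #
    #   soup = BeautifulSoup("<p>On those grounds,</p><p>the Court rules ...</p>", "lxml")
    #   list(soup.stripped_strings)  # ['On those grounds,', 'the Court rules ...']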

def __call__(self) -> list:
"""
This inbuilt __call__ function loops through all the methods in the class `Analyzer` and returns the list , with values of the operative part .
"""

container = [self.html_page_structure_one(), self.html_page_structure_two(), self.structure_three(), self.structure_four(), self.structure_five(),
self.structure_six(), self.structure_seven(), self.structure_eight(), self.structure_nine(), self.structure_ten(), self.structure_eleven()]

one: list
for funcs in range(len(container)):

one = container[funcs]

if one:
if (len(one) != 0 or one[0] != "\n"):
print("here")
return one
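
    # Usage sketch (the CELEX number below is illustrative only):
    #
    #   operative = Analyzer("62012CJ0131")()
    #   if operative:
    #       print(operative)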




class Writing():
"""
This class has different methods , for the purpose of writing the operative part into different file formats.(Csv,txt,json)
"""

instance: str
x: str
parameter: str

def __init__(self, celex: str):
self.celex = celex
self.instance = Analyzer(self.celex)
self.x = self.instance()

    def to_csv(self):
        if self.x:
            with open("csv/output.csv", "a+", newline="") as file:
                writer = csv.writer(file)
                writer.writerow([self.celex, self.x])

    def to_json(self):
        if self.x:
            data = {'Celex': self.celex, "Operative part": self.x}
            with open('json/data.json', 'a+') as file:
                json.dump(data, file)

    def to_txt(self):
        if self.x:
            with open(f"txt/{self.celex}.txt", "a") as file:
                for w in self.x:
                    file.write(w + "\n")
# Sample code for reading CELEX ids from a tsv file


if __name__ == "__main__":
    testing = []
    with open("gijs_202310_node_list.tsv", "r") as file:
        reader = csv.reader(file, delimiter="\t")
        for row in reader:
            # skip the header row and keep the first column (the CELEX id)
            if row and "Id" not in row[0]:
                testing.append(row[0])

    for celex_id in testing:
        instance = Writing(celex_id)
        instance.to_csv()
        print(celex_id)
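
# A minimal sketch of processing a single CELEX id end to end, assuming the
# csv/, json/ and txt/ output directories already exist (the id is illustrative):
#
#   writer = Writing("62012CJ0131")
#   writer.to_csv()
#   writer.to_json()
#   writer.to_txt()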
