From 60e561db52c0cd6eeba784176a024129d894824d Mon Sep 17 00:00:00 2001 From: Piotr Lewandowski Date: Sat, 15 Jul 2023 14:28:32 +0200 Subject: [PATCH 01/11] removing anything non-celllar --- .github/workflows/github-actions.yml | 2 - README.md | 3 +- RS_citations/README.md | 103 ----- .../__init__.py | 1 - .../citations_extractor.py | 332 --------------- .../testing.py | 10 - RS_citations/setup.py | 25 -- echr/README.md | 227 ---------- echr/echr_extractor/ECHR_html_downloader.py | 79 ---- .../echr_extractor/ECHR_metadata_harvester.py | 230 ----------- .../ECHR_nodes_edges_list_transform.py | 301 -------------- echr/echr_extractor/__init__.py | 1 - echr/echr_extractor/clean_ref.py | 9 - echr/echr_extractor/echr.py | 96 ----- echr/echr_extractor/testing_file.py | 32 -- echr/setup.py | 26 -- rechtspraak/README.md | 159 ------- rechtspraak/rechtspraak_extractor/__init__.py | 4 - .../rechtspraak_extractor/rechtspraak.py | 117 ------ .../rechtspraak_functions.py | 44 -- .../rechtspraak_metadata.py | 387 ------------------ .../rechtspraak_extractor/testing_file.py | 6 - .../rechtspraak_extractor/tests/__init__.py | 12 - .../tests/rechtspraak.py | 140 ------- .../tests/rechtspraak_functions.py | 41 -- .../tests/rechtspraak_metadata.py | 319 --------------- rechtspraak/setup.py | 25 -- tests.py | 65 --- 28 files changed, 2 insertions(+), 2794 deletions(-) delete mode 100644 RS_citations/README.md delete mode 100644 RS_citations/rechtspraak_citations_extractor/__init__.py delete mode 100644 RS_citations/rechtspraak_citations_extractor/citations_extractor.py delete mode 100644 RS_citations/rechtspraak_citations_extractor/testing.py delete mode 100644 RS_citations/setup.py delete mode 100644 echr/README.md delete mode 100644 echr/echr_extractor/ECHR_html_downloader.py delete mode 100644 echr/echr_extractor/ECHR_metadata_harvester.py delete mode 100644 echr/echr_extractor/ECHR_nodes_edges_list_transform.py delete mode 100644 echr/echr_extractor/__init__.py delete mode 100644 echr/echr_extractor/clean_ref.py delete mode 100644 echr/echr_extractor/echr.py delete mode 100644 echr/echr_extractor/testing_file.py delete mode 100644 echr/setup.py delete mode 100644 rechtspraak/README.md delete mode 100644 rechtspraak/rechtspraak_extractor/__init__.py delete mode 100644 rechtspraak/rechtspraak_extractor/rechtspraak.py delete mode 100644 rechtspraak/rechtspraak_extractor/rechtspraak_functions.py delete mode 100644 rechtspraak/rechtspraak_extractor/rechtspraak_metadata.py delete mode 100644 rechtspraak/rechtspraak_extractor/testing_file.py delete mode 100644 rechtspraak/rechtspraak_extractor/tests/__init__.py delete mode 100644 rechtspraak/rechtspraak_extractor/tests/rechtspraak.py delete mode 100644 rechtspraak/rechtspraak_extractor/tests/rechtspraak_functions.py delete mode 100644 rechtspraak/rechtspraak_extractor/tests/rechtspraak_metadata.py delete mode 100644 rechtspraak/setup.py diff --git a/.github/workflows/github-actions.yml b/.github/workflows/github-actions.yml index d5f729f..ea25b00 100644 --- a/.github/workflows/github-actions.yml +++ b/.github/workflows/github-actions.yml @@ -18,8 +18,6 @@ jobs: run: | python -m pip install --upgrade pip pip install cellar-extractor - pip install rechtspraak-extractor - pip install echr-extractor # pip install echr-extractor - run: echo "💡 The ${{ github.repository }} repository has been cloned to the runner." - run: echo "🖥️ The workflow is now ready to test your code on the runner." 
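The workflow above now installs only cellar-extractor, the one package this repository keeps, and the README change below points to its PyPI documentation. For orientation, a minimal usage sketch follows; the function name get_cellar and the max_ecli, sd and save_file parameters are assumed by analogy with the rechtspraak and echr extractors removed in this patch and should be checked against the cellar-extractor documentation.

```
# Assumed usage sketch for the retained cellar-extractor package.
# get_cellar and its parameters are not verified here; consult the linked PyPI docs.
import cellar_extractor as cellar

# Fetch roughly 100 CELLAR decisions published since 1 August 2022, kept in memory.
df = cellar.get_cellar(max_ecli=100, sd='2022-08-01', save_file='n')
print(df.head())
```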
diff --git a/README.md b/README.md index 7e1d5c9..34ee8a2 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,3 @@ # extraction_libraries -Python libraries for extracting from data sources like Rechtspraak, ECHR, Cellar +Python library for extracting caselaw data from Cellar. +Full documentation available at [cellar-extractor](https://pypi.org/project/cellar-extractor/). diff --git a/RS_citations/README.md b/RS_citations/README.md deleted file mode 100644 index ec640fa..0000000 --- a/RS_citations/README.md +++ /dev/null @@ -1,103 +0,0 @@ -## Rechtspraak citations -This library contains a function that acquires citation data for Rechtspraak cases using the LIDO API. - -## Version -Python 3.9 - -## Contributors -
- Cloud956 (Piotr Lewandowski)
- shashankmc
- gijsvd
- - -## How to install? -pip install rechtspraak_citations_extractor - -## What are the functions? -
  • Rechtspraak Citations Extractor -
      -
    1. get_citations
    2. - Gets all the data about case law citing/being cited and the legislation cited by the cases in the passed DataFrame of case metadata. Requires a valid DataFrame object with a column titled 'ecli'. Returns the same DataFrame object, with 3 additional columns containing JSON strings of citation information. 
  • - -## What are the parameters? -
      -
    1. get_citations(dataframe = None, username = '', password = '', threads = 2)
    2. - Parameters: dataframe (a DataFrame of Rechtspraak case metadata containing an 'ecli' column), username and password (LIDO API credentials), threads (number of threads used to fetch citations). 
    - - -## Examples -``` -import rechtspraak_extractor as rex -import rechtspraak_citations_extractor as rex_citations ------------------------------------------------------------------------------------------------------------------------ - -# To get the rechtspraak data in a dataframe: -df = rex.get_rechtspraak(max_ecli=100, sd='2022-08-01', save_file='y') # Gets 100 ECLIs from 1st August 2022 -df = get_rechtspraak_metadata(save_file='n',dataframe=df) -# To get the citations: -df_with_citaitons = rex_citations.get_citations(df,'username','password') -``` - - -## License -[![License: Apache 2.0](https://img.shields.io/github/license/maastrichtlawtech/extraction_libraries)](https://opensource.org/licenses/Apache-2.0) - -Previously under the [MIT License](https://opensource.org/licenses/MIT), as of 28/10/2022 this work is licensed under a [Apache License, Version 2.0](https://opensource.org/licenses/Apache-2.0). -``` -Apache License, Version 2.0 - -Copyright (c) 2022 Maastricht Law & Tech Lab - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -``` diff --git a/RS_citations/rechtspraak_citations_extractor/__init__.py b/RS_citations/rechtspraak_citations_extractor/__init__.py deleted file mode 100644 index 9d98e28..0000000 --- a/RS_citations/rechtspraak_citations_extractor/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from rechtspraak_citations_extractor.citations_extractor import get_citations diff --git a/RS_citations/rechtspraak_citations_extractor/citations_extractor.py b/RS_citations/rechtspraak_citations_extractor/citations_extractor.py deleted file mode 100644 index 8810807..0000000 --- a/RS_citations/rechtspraak_citations_extractor/citations_extractor.py +++ /dev/null @@ -1,332 +0,0 @@ -import requests -from lxml import etree -import urllib.request -import rdflib -import threading -import json -import pandas as pd -from dotenv import load_dotenv -from requests.auth import HTTPBasicAuth -from tqdm import tqdm -load_dotenv() - -LIDO_ENDPOINT = "http://linkeddata.overheid.nl/service/get-links" - -target_ecli = 'target_ecli' -label = 'label' -type = 'type' -ecli = 'ecli' -case_citations_fieldnames = [target_ecli, label, type] -legislation_citations_fieldnames = ['legal_provision_url_lido', 'legal_provision_url', 'legal_provision'] - - -def remove_spaces_from_ecli(ecli): - return ecli.replace(" ", "") - - -def write_incremental_rows(filename, data): - with open(filename, 'a') as f: - pd.DataFrame(data).to_csv(f, mode='a', header=not f.tell(), index=False) - - -# Code to execute LIDO API call -def get_lido_response(url, username, password): - authentication = HTTPBasicAuth(username, password) - response = requests.get(url, auth=authentication) - if response.status_code == 200: - return response.text - else: - raise Exception('LinkedData responded with code {}: {}. 
{}'.format(response.status_code, response.reason, url)) - - -# Extract the ECLI code from the LIDO identifier of the cited case law from the XML response from LIDO API -def get_ecli(sub_ref): - return sub_ref.attrib['idref'].split('/')[-1] - - -# Extract the LIDO identifier of the cited legislation from the XML response from LIDO API -def get_legislation_identifier(sub_ref): - return sub_ref.attrib['idref'] - - -# Find the webpage expressing, in writing, the legislation referred to by the input LIDO identifier -def get_legislation_webpage(identifier): - idcomponents = identifier.split("/") - date = idcomponents[len(idcomponents) - 1] - url = identifier - page = urllib.request.urlopen(url) - g = rdflib.Graph() - g.parse(page, format="xml") - article = "" - for s, p, o in g: - if str(p) == "http://purl.org/dc/terms/identifier": - article = o - if date in str(o): - return o - - return article - - -def get_legislation_name(url, username, password): - # turn the response into an xml tree - xml_response = get_lido_response(url, username, password) - xml = etree.fromstring(bytes(xml_response, encoding='utf8')) - - pref_label = "" - title = "" - # RDF main element (root) - for element in xml.iterchildren(): - # there is only one child and it is the "description" in which the rest of the info is - # go through all the tags (all the info) - for el in element.iterchildren(): - # the title (same thing as the preLabel) is the feature we want to be using - if el.tag == "{http://purl.org/dc/terms/}title": - title = el.text - - return title - - -# Check if outgoing links in the XML response from the LIDO API are of type "Jurisprudentie" (case law) -def is_case_law(sub_ref): - return sub_ref.attrib['groep'] == 'Jurisprudentie' - - -# Check if outgoing links in the XML response from the LIDO API are of type "Wet" (legislation) -def is_legislation(sub_ref): - return sub_ref.attrib['groep'] == 'Wet' or sub_ref.attrib['groep'] == 'Artikel' - - -# Extract ECLI code of citation from a lido identifier. -# Example of a LIDO identifier "https://linkeddata.overheid.nl/terms/bwb/id/BWBR0020368/8655654/2016-08-11/2016-08-11" -def get_lido_id(ecli): - return "http://linkeddata.overheid.nl/terms/jurisprudentie/id/" + ecli - - -# Method written by Marion -""" -These methods are used to write the citations incrementally to the csv file (in case it crashes or times out). -It allows us to stop the script whenever we want without loosing our data, and without having to start from the bginning the next time. 
-""" - - -# Main method to execute LIDO API call on a list of ECLIs from a CSV file and extract the citations of each -# Add the implementation of the incremental writing of rows -def find_citations_for_cases(dataframe, username, password): - df_eclis = dataframe.reset_index(drop=True) - - eclis = list(df_eclis['ecli'].dropna()) - total_incoming = [] - total_outgoing = [] - total_legislations = [] - - for i, ecli in enumerate(eclis): - case_citations_incoming, case_citations_outgoing, legislation_citations = find_citations_for_case( - remove_spaces_from_ecli(ecli), case_citations_fieldnames, legislation_citations_fieldnames, username, - password) - if case_citations_incoming: - total_incoming.extend(case_citations_incoming) - if case_citations_outgoing: - total_outgoing.extend(case_citations_outgoing) - if legislation_citations: - total_legislations.extend(legislation_citations) - df_incoming = pd.DataFrame(total_incoming) - df_outgoing = pd.DataFrame(total_outgoing) - df_legislations = pd.DataFrame(total_legislations) - return df_incoming, df_outgoing, df_legislations - - -def citations_multithread_single(big_incoming, big_outgoing, big_legislations, ecli, username, password, current_index,bar): - incoming_df = pd.Series([], dtype='string') - outgoing_df = pd.Series([], dtype='string') - legislations_df = pd.Series([], dtype='string') - for i, ecli in enumerate(ecli): - index = current_index + i - case_citations_incoming, case_citations_outgoing, legislation_citations = find_citations_for_case( - remove_spaces_from_ecli(ecli), case_citations_fieldnames, legislation_citations_fieldnames, username, - password) - if case_citations_incoming: - encoded = json.dumps(case_citations_incoming) - incoming_df[index] = encoded - if case_citations_outgoing: - encoded = json.dumps(case_citations_outgoing) - outgoing_df[index] = encoded - if legislation_citations: - encoded = json.dumps(legislation_citations) - legislations_df[index] = encoded - bar.update(1) - big_incoming.append(incoming_df) - big_outgoing.append(outgoing_df) - big_legislations.append(legislations_df) - - -def add_column_frow_list(data, name, list): - column = pd.Series([], dtype='string') - for l in list: - column = column.append(l) - column.sort_index(inplace=True) - data.insert(1, name, column) - - -def find_citations_for_cases_multithread(dataframe, username, password, threads): - ecli = dataframe['ecli'].dropna().reset_index(drop=True) - length = ecli.size - at_once_threads = int(length / threads) - big_incoming = [] - big_outgoing = [] - big_legislations = [] - threads = [] - bar = tqdm(total=length, colour="GREEN",position=0, leave=True,miniters=int(length/100),maxinterval=10000) - for i in range(0, length, at_once_threads): - curr_ecli = ecli[i:(i + at_once_threads)] - t = threading.Thread(target=citations_multithread_single, - args=[big_incoming, big_outgoing, big_legislations, curr_ecli, username, password, i,bar]) - threads.append(t) - for t in threads: - t.start() - for t in threads: - t.join() - add_column_frow_list(dataframe, 'citations_incoming', big_incoming) - add_column_frow_list(dataframe, 'citations_outgoing', big_outgoing) - add_column_frow_list(dataframe, 'legislations_cited', big_legislations) - return dataframe - - -def add_citations_no_duplicates(already_existing_list, element): - duplicate = False - new_ecli = get_ecli(element) - added_sth_new = True - for stored in already_existing_list: - if stored[target_ecli] == new_ecli: - added_sth_new = False - duplicate = True - break - if not duplicate: - 
already_existing_list.append({target_ecli: new_ecli, - label: element.attrib['label'], - type: element.attrib['type'].split('/id/')[1]}) - return added_sth_new - - -def add_legislations_no_duplicates(list, element): - duplicate = False - new_legislation = get_legislation_identifier(element) - added_sth_new = True - for legs in list: - if new_legislation == legs: - added_sth_new = False - duplicate = True - break - if not duplicate: - list.append(get_legislation_identifier(element)) - return added_sth_new - - -# Main method to execute LIDO API call on the ECLI code of the input case and extract the citations -def find_citations_for_case(ecli, case_citations_fieldnames, legislation_citations_fieldnames, username, password): - xml_elements = [] - case_law_citations_outgoing = [] - legislation_citations = [] - case_law_citations_incoming = [] - start_page = 0 - end_of_pages = False - outgoing = "uitgaande-links" - incoming = "inkomende-links" - - while not end_of_pages: - added_sth_new = False - url = "{}?id={}&start={}&rows={}&output=xml".format(LIDO_ENDPOINT, get_lido_id(ecli), start_page, 100) - start_page += 1 - - xml_text = get_lido_response(url, username, password) - xml_elements.append(etree.fromstring(xml_text.encode('utf8'))) - - for el in xml_elements: - - for sub in list(el.iterchildren('subject')): - - for the_citations in sub.iterchildren(outgoing): - for sub_ref in the_citations.iterchildren(): - if is_case_law(sub_ref): - added_sth_new = add_citations_no_duplicates(case_law_citations_outgoing, sub_ref) - elif is_legislation(sub_ref): - added_sth_new = add_legislations_no_duplicates(legislation_citations, sub_ref) - - for the_citations in sub.iterchildren(incoming): - for sub_ref in the_citations.iterchildren(): - if is_case_law(sub_ref): - added_sth_new = add_citations_no_duplicates(case_law_citations_incoming, sub_ref) - - if not added_sth_new or start_page > 15: - #print(start_page) - end_of_pages = True - - # Remove duplicates empties - - for item in case_law_citations_incoming: - if item[target_ecli] == "": - case_law_citations_incoming.remove(item) - for item in case_law_citations_outgoing: - if item[target_ecli] == "": - case_law_citations_outgoing.remove(item) - - # Remove input case ECLI (for some reason a case can cite itself...) 
- for dicts in case_law_citations_incoming: - if dicts[target_ecli] == remove_spaces_from_ecli(ecli): - case_law_citations_incoming.remove(dicts) - break - for dicts in case_law_citations_outgoing: - if dicts[target_ecli] == remove_spaces_from_ecli(ecli): - case_law_citations_outgoing.remove(dicts) - break - if (remove_spaces_from_ecli(ecli) in case_law_citations_incoming): - case_law_citations_incoming.remove(remove_spaces_from_ecli(ecli)) - - case_law_result_outgoing = extract_results_citations(case_law_citations_outgoing, ecli, case_citations_fieldnames) - case_law_results_incoming = extract_results_citations(case_law_citations_incoming, ecli, case_citations_fieldnames) - legislation_results = extract_results_legislations(legislation_citations, ecli, legislation_citations_fieldnames, - username, password) - - return case_law_results_incoming, case_law_result_outgoing, legislation_results - - -def extract_results_citations(list, ecli, fields): - list_of_all_results = [] - - for case_citation in list: - case_law_result = {key: None for key in fields} - case_law_result[fields[0]] = (remove_spaces_from_ecli(case_citation[target_ecli])) # Target ECLI - case_law_result[fields[1]] = (case_citation['label']) # Target ECLI - case_law_result[fields[2]] = (case_citation['type']) # Target ECLI - list_of_all_results.append(case_law_result) - return list_of_all_results - - -def extract_results_legislations(list, ecli, fields, username, password): - list_of_all_results = [] - - for leg_citation in list: - legislation_result = {key: None for key in fields} - legislation_result[fields[0]] = (leg_citation) # Target article - legislation_result[fields[1]] = (get_legislation_webpage(leg_citation)) # Target article webpage - legislation_result[fields[2]] = ( - get_legislation_name(leg_citation, username, password)) # pref label == article name - list_of_all_results.append(legislation_result) - return list_of_all_results - - -def get_citations(dataframe=None, username="", password="", threads=1): - if dataframe is None or not username or not password: - print("Incorrect arguments passed. Returning...") - return False - try: - get_lido_response(LIDO_ENDPOINT,username,password) - except: - print('LIDO cannot be accessed with these login details. 
Returning...') - return False - print('\n--- START OF RS CITATIONS EXTRACTIONS ---\n') - - # find citations, and save the file incrementally - df = find_citations_for_cases_multithread(dataframe, username, password, threads) - - print("\n--- DONE ---") - return df diff --git a/RS_citations/rechtspraak_citations_extractor/testing.py b/RS_citations/rechtspraak_citations_extractor/testing.py deleted file mode 100644 index 38f1cc7..0000000 --- a/RS_citations/rechtspraak_citations_extractor/testing.py +++ /dev/null @@ -1,10 +0,0 @@ -import pandas as pd - - -from citations_extractor import get_citations - -if __name__ == '__main__': - name = 'rechtspraak_2018-01-01_2023-06-02_17-45-29_metadata.csv' - data = pd.read_csv(name) - df = get_citations(data,'','',2) - b=2 \ No newline at end of file diff --git a/RS_citations/setup.py b/RS_citations/setup.py deleted file mode 100644 index 90ba586..0000000 --- a/RS_citations/setup.py +++ /dev/null @@ -1,25 +0,0 @@ -# This file is required to create a python library - -from setuptools import find_packages, setup -from pathlib import Path - -p = Path("README.md") -long_descr = p.read_text() - -setup( - name='rechtspraak_citations_extractor', - packages=find_packages(include=['rechtspraak_citations_extractor']), - version='1.0.8', - description='Library for extracting rechtspraak citations via LIDO', - author='LawTech Lab', - license='MIT', - install_requires=['requests>=2.26.0', 'python_dotenv==0.15.0', 'pandas >=1.2.5','urllib3>=1.26.12','lxml>=4.6.3','tqdm'], - author_email='p.lewandowski@student.maastrichtuniversity.nl', - keywords=['rechtspraak', 'citations', 'rechtspraak citations', 'RS citations'], - long_description=long_descr, - long_description_content_type='text/markdown', - project_urls={ - "Bug Tracker": "https://github.com/maastrichtlawtech/extraction_libraries", - "Build Source": "https://github.com/maastrichtlawtech/extraction_libraries", - }, -) \ No newline at end of file diff --git a/echr/README.md b/echr/README.md deleted file mode 100644 index 89558b6..0000000 --- a/echr/README.md +++ /dev/null @@ -1,227 +0,0 @@ -## echr extractor -This library contains functions to get ECHR data. - -## Version -Python 3.9 - -## Contributors - - - - - - - - - - - - -
    - brodriguesdemiranda (Benjamin Rodrigues de Miranda)
    - ChloeCro (Chloe Crombach)
    - Cloud956 (Piotr Lewandowski)
    - pranavnbapat (Pranav Bapat)
    - running-machin
    - shashankmc
    - gijsvd
    - - -## How to install? -pip install echr-extractor - -## What are the functions? -
      -
    1. get_echr
    2. - Gets all of the available metadata for ECHR cases from the HUDOC database. Can be saved to a file or returned in memory. 
      -
    3. get_echr_extra
    4. - Gets all of the available metadata for ECHR cases from the HUDOC database and, on top of that, downloads the full text of each case. Can be saved to a file or returned in memory. 
      -
    5. get_nodes_edges
    6. - Gets all of the available nodes and edges for ECHR cases from the given HUDOC metadata. 
    - -## What are the parameters? -
      -
    1. get_echr
    2. - Parameters: start_id, end_id, count, start_date, end_date, verbose, save_file, fields, link, language. 
    3. get_echr_extra
    4. - Parameters: the same as get_echr, plus threads (the number of threads used to download the full texts). 
    5. get_nodes_edges
    6. - Parameters: metadata_path (path to a previously saved metadata CSV file) and save_file. 
    - -## Examples - -``` -import echr_extractor as echr - -Below are examples for in-file saving: - -df, json = echr.get_echr_extra(count=100,save_file='y',threads=10) -df = echr.get_echr(start_id=1,save_file='y',skip_missing_dates=True) - -Below are examples for in-memory saving: - -df, json = echr.get_echr_extra(start_id=20,end_id=3000,save_file='n') - -df = echr.get_echr(start_id=1000,count=2000,save_file='n',verbose=True) - -nodes, edges = echr.get_nodes_edges(metadata_path='data/echr_metadata.csv',save_file='n') -``` -``` - -## License -[![License: Apache 2.0](https://img.shields.io/github/license/maastrichtlawtech/extraction_libraries)](https://opensource.org/licenses/Apache-2.0) - -Previously under the [MIT License](https://opensource.org/licenses/MIT), as of 28/10/2022 this work is licensed under a [Apache License, Version 2.0](https://opensource.org/licenses/Apache-2.0). -``` -Apache License, Version 2.0 - -Copyright (c) 2022 Maastricht Law & Tech Lab - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -``` - - -## Appendix - -``` -To properly use the 'link' parameter of the extraction methods, the user should head to - -https://hudoc.echr.coe.int/eng#%20 - -There, the user can use the tools of Advanced Search of HUDOC to search for specific cases. -Afterwards*, the user can copy the link of the current website, and pass it on to the extraction methods. - - -* It should be noted that the link only updates after the 'search' button of the Advanced Search is clicked. - - - -The full list of fields is as follows: - -fields = ['itemid','applicability','application','appno','article','conclusion','decisiondate','docname', -'documentcollectionid','documentcollectionid2','doctype','doctypebranch','ecli','externalsources','extractedappno', -'importance','introductiondate','isplaceholder','issue','judgementdate','kpdate','kpdateAsText','kpthesaurus', -'languageisocode','meetingnumber','originatingbody','publishedby','Rank','referencedate','reportdate','representedby', -'resolutiondate',resolutionnumber','respondent','respondentOrderEng','rulesofcourt','separateopinion','scl', -'sharepointid','typedescription','nonviolation','violation'] - -``` -These fields can take different values, for more information head to https://hudoc.echr.coe.int. 
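As a complement to the Appendix above, here is a brief, illustrative sketch of passing a HUDOC Advanced Search link and a reduced field list to the extractor; the link value is a placeholder to be replaced with a URL copied from HUDOC after clicking 'search', and the keyword arguments follow the get_echr signature defined later in this patch.

```
# Illustrative only: the link below is a placeholder for a URL copied from the
# HUDOC Advanced Search page, and the fields are a subset of the Appendix list.
import echr_extractor as echr

hudoc_link = "https://hudoc.echr.coe.int/eng#{...}"  # replace with your own search link
df = echr.get_echr(
    link=hudoc_link,
    fields=['itemid', 'appno', 'ecli', 'judgementdate', 'languageisocode'],
    save_file='n',
    verbose=True,
)
```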
\ No newline at end of file diff --git a/echr/echr_extractor/ECHR_html_downloader.py b/echr/echr_extractor/ECHR_html_downloader.py deleted file mode 100644 index f857de8..0000000 --- a/echr/echr_extractor/ECHR_html_downloader.py +++ /dev/null @@ -1,79 +0,0 @@ -from bs4 import BeautifulSoup -import requests -import threading - -base_url = 'https://hudoc.echr.coe.int/app/conversion/docx/html/body?library=ECHR&id=' - - -def get_full_text_from_html(html_text): - # This method turns the html code from the summary page into text - # It has different cases depending on the first character of the CELEX ID - # Should only be used for summaries extraction - soup = BeautifulSoup(html_text, "html.parser") - for script in soup(["script", "style"]): - script.extract() # rip it out - text = soup.get_text() - # break into lines and remove leading and trailing space on each - lines = (line.strip() for line in text.splitlines()) - # break multi-headlines into a line each - chunks = (phrase.strip() for line in lines for phrase in line.split(" ")) - # drop blank lines - text = '\n'.join(chunk for chunk in chunks if chunk) - text = text.replace(",", "_") - return text - - -def download_full_text_main(df, threads): - item_ids = df['itemid'] - eclis = df['ecli'] - length = item_ids.size - if length>threads: - at_once_threads = int(length / threads) - else: - at_once_threads=length - all_dict = list() - threads = [] - for i in range(0, length, at_once_threads): - curr_ids = item_ids[i:(i + at_once_threads)] - curr_ecli = eclis[i:(i + at_once_threads)] - t = threading.Thread(target=download_full_text_separate, args=(curr_ids,curr_ecli, all_dict)) - threads.append(t) - for t in threads: - t.start() - for t in threads: - t.join() - - json_file=list() - for l in all_dict: - if len(l)>0: - json_file.extend(l) - return json_file - - -def download_full_text_separate(item_ids,eclis, dict_list): - full_list = [] - eclis = eclis.reset_index(drop=True) - item_ids = item_ids.reset_index(drop=True) - def download_html(item_ids,eclis): - retry_ids = [] - retry_eclis = [] - for i in range(len(item_ids)): - item_id=item_ids[i] - ecli=eclis[i] - try: - r = requests.get(base_url + item_id, timeout=1) - json_dict={ - 'item_id': item_id, - 'ecli': ecli, - 'full_text': get_full_text_from_html(r.text) - } - full_list.append(json_dict) - except Exception: - retry_ids.append(item_id) - retry_eclis.append(ecli) - return retry_ids, retry_eclis - - retry_ids, retry_eclis = download_html(item_ids, eclis) - download_html(retry_ids, retry_eclis) - dict_list.append(full_list) - diff --git a/echr/echr_extractor/ECHR_metadata_harvester.py b/echr/echr_extractor/ECHR_metadata_harvester.py deleted file mode 100644 index e93b661..0000000 --- a/echr/echr_extractor/ECHR_metadata_harvester.py +++ /dev/null @@ -1,230 +0,0 @@ -import requests -from datetime import datetime -import pandas as pd - - -def get_r(url, timeout, retry, verbose): - """ - Get data from a URL. If this is uncuccessful it is attempted again up to a number of tries - given by retry. If it is still unsuccessful the batch is skipped. - :param str url: The data source URL. - :param double timeout: The amount of time to wait for a response each attempt. - :param int retry: The number of times to retry upon failure. - :param bool verbose: Whether or not to print extra information. 
- """ - count = 0 - max_attempts = 20 - while count < max_attempts: - try: - r = requests.get(url, timeout=timeout) - return r - except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectTimeout): - count += 1 - if verbose: - print(f"Timeout. Retry attempt {count}.") - if count > retry: - if verbose: - print(f"Unable to connect to {url}. Skipping this batch.") - return None - return None - - -def basic_function(term, values): - values = ['"' + i + '"' for i in values] - main_body = list() - cut_term = term.replace('"', '') - for v in values: - main_body.append(f"({cut_term}={v}) OR ({cut_term}:{v})") - query = f"({' OR '.join(main_body)})" - return query - - -def link_to_query(link): - extra_cases_map = { - "bodyprocedure": '("PROCEDURE" ONEAR(n=1000) terms OR "PROCÉDURE" ONEAR(n=1000) terms)', - "bodyfacts": '("THE FACTS" ONEAR(n=1000) terms OR "EN FAIT" ONEAR(n=1000) terms)', - "bodycomplaints": '("COMPLAINTS" ONEAR(n=1000) terms OR "GRIEFS" ONEAR(n=1000) terms)', - "bodylaw": '("THE LAW" ONEAR(n=1000) terms OR "EN DROIT" ONEAR(n=1000) terms)', - "bodyreasons": '("FOR THESE REASONS" ONEAR(n=1000) terms OR "PAR CES MOTIFS" ONEAR(n=1000) terms)', - "bodyseparateopinions": '(("SEPARATE OPINION" OR "SEPARATE OPINIONS") ONEAR(n=5000) terms OR "OPINION ' - 'SÉPARÉE" ONEAR(n=5000) terms)', - "bodyappendix": '("APPENDIX" ONEAR(n=1000) terms OR "ANNEXE" ONEAR(n=1000) terms)' - } - - def full_text_function(term, values): - return f"({','.join(values)})" - - def date_function(term, values): - values = ['"' + i + '"' for i in values] - query = '(kpdate >= "first_term" AND kpdate <= "second_term")' - query = query.replace("first_term", values[0]) - query = query.replace("second_term", values[1]) - return query - - def advanced_function(term, values): - body = extra_cases_map.get(term) - query = body.replace("terms", ",".join(vals)) - return query - - query_map = { - "docname": basic_function, - "appno": basic_function, - "scl": basic_function, - "rulesofcourt": basic_function, - "applicability": basic_function, - "ecli": basic_function, - "conclusion": basic_function, - "resolutionnumber": basic_function, - "separateopinions": basic_function, - "externalsources": basic_function, - "kpthesaurus": basic_function, - "advopidentifier": basic_function, - "documentcollectionid2": basic_function, - "fulltext": full_text_function, - "kpdate": date_function, - "bodyprocedure": advanced_function, - "bodyfacts": advanced_function, - "bodycomplaints": advanced_function, - "bodylaw": advanced_function, - "bodyreasons": advanced_function, - "bodyseparateopinions": advanced_function, - "bodyappendix": advanced_function, - "languageisocode": basic_function - - } - start = link.index("{") - link_dictionary = eval(link[start:]) - base_query = 'https://hudoc.echr.coe.int/app/query/results?query=contentsitename:ECHR' \ - ' AND (NOT (doctype=PR OR doctype=HFCOMOLD OR doctype=HECOMOLD)) AND ' \ - 'inPutter&select={select}&sort=itemid%20Ascending&start={start}&length={length}' - query_elements = list() - for key in list(link_dictionary.keys()): - vals = link_dictionary.get(key) - funct = query_map.get(key) - query_elements.append(funct(key, vals)) - query_total = ' AND '.join(query_elements) - final_query = base_query.replace('inPutter', query_total) - # print(final_query) - # page = requests.get(final_query) - # results = eval(page.text) - # print(results.get('resultcount')) - return final_query - - -def get_echr_metadata(start_id, end_id, verbose, fields, start_date, end_date, link, language): - """ - Read ECHR 
metadata into a Pandas DataFrame. - :param int start_id: The index to start the search from. - :param int end_id: The index to end search at, where the default fetches all results. - :param date start_date: The point from which to save cases. - :param date end_date: The point before which to save cases. - :param bool verbose: Whether or not to print extra information. - """ - data = [] - if not fields: - fields = ['itemid', 'applicability', 'appno', 'article', 'conclusion', 'docname', - 'doctype', 'doctypebranch', 'ecli', 'importance', 'judgementdate', - 'languageisocode', 'originatingbody', 'violation', 'nonviolation', - 'extractedappno', 'scl', 'publishedby', 'representedby', 'respondent', - 'separateopinion', 'sharepointid', 'externalsources', 'issue', 'referencedate', - 'rulesofcourt', 'DocId', 'WorkId', 'Rank', 'Author', 'Size', 'Path', - 'Description', 'Write', 'CollapsingStatus', 'HighlightedSummary', - 'HighlightedProperties', 'contentclass', 'PictureThumbnailURL', - 'ServerRedirectedURL', 'ServerRedirectedEmbedURL', 'ServerRedirectedPreviewURL', - 'FileExtension', 'ContentTypeId', 'ParentLink', 'ViewsLifeTime', 'ViewsRecent', - 'SectionNames', 'SectionIndexes', 'SiteLogo', 'SiteDescription', 'deeplinks', - 'SiteName', 'IsDocument', 'LastModifiedTime', 'FileType', 'IsContainer', - 'WebTemplate', 'SecondaryFileExtension', 'docaclmeta', 'OriginalPath', - 'EditorOWSUSER', 'DisplayAuthor', 'ResultTypeIdList', 'PartitionId', 'UrlZone', - 'AAMEnabledManagedProperties', 'ResultTypeId', 'rendertemplateid'] - if link: - META_URL = link_to_query(link) - - else: - META_URL = 'http://hudoc.echr.coe.int/app/query/results' \ - '?query=(contentsitename=ECHR) AND ' \ - '(documentcollectionid2:"JUDGMENTS" OR ' \ - 'documentcollectionid2:"COMMUNICATEDCASES" OR ' \ - 'documentcollectionid2:"DECISIONS" OR ' \ - 'documentcollectionid2:"CLIN") AND ' \ - 'lang_inputter' \ - '&select={select}' + \ - '&sort=itemid Ascending' + \ - '&start={start}&length={length}' - - # An example url: "https://hudoc.echr.coe.int/app/query/results?query=(contentsitename=ECHR)%20AND%20(documentcollectionid2:%22JUDGMENTS%22%20OR%20documentcollectionid2:%22COMMUNICATEDCASES%22%20OR%20documentcollectionid2:%22DECISIONS%22%20OR%20documentcollectionid2:%22CLIN%22)&select=itemid,applicability,application,appno,article,conclusion,decisiondate,docname,documentcollectionid,%20documentcollectionid2,doctype,doctypebranch,ecli,externalsources,extractedappno,importance,introductiondate,%20isplaceholder,issue,judgementdate,kpdate,kpdateAsText,kpthesaurus,languageisocode,meetingnumber,%20originatingbody,publishedby,Rank,referencedate,reportdate,representedby,resolutiondate,%20resolutionnumber,respondent,respondentOrderEng,rulesofcourt,separateopinion,scl,sharepointid,typedescription,%20nonviolation,violation&sort=itemid%20Ascending&start=0&length=200" - - if start_date and end_date: - addition = f'(kpdate>="{start_date}" AND kpdate<="{end_date}")' - elif start_date: - end_date = datetime.today().date() - addition = f'(kpdate>="{start_date}" AND kpdate<="{end_date}")' - elif end_date: - start_date = '1900-01-01' - addition = f'(kpdate>="{start_date}" AND kpdate<="{end_date}")' - else: - addition = '' - - if addition: - META_URL = META_URL.replace('(contentsitename=ECHR)', '(contentsitename=ECHR) AND ' + addition) - - META_URL = META_URL.replace(' ', '%20') - META_URL = META_URL.replace('"', '%22') - language_input = basic_function('languageisocode', language) - if not link: - META_URL = META_URL.replace('lang_inputter', language_input) - - 
META_URL = META_URL.replace('{select}', ','.join(fields)) - - - - url = META_URL.format(start=0, length=1) - print(url) - r = requests.get(url) - resultcount = r.json()['resultcount'] - print("available results: ", resultcount) - - if not end_id: - end_id = resultcount - if verbose: - print(f'Fetching {end_id - start_id} results from index {start_id} to index {end_id} ' - f'{f" and filtering cases after {start_date}" if start_date else ""} {f"and filtering cases before {end_date}" if end_date else "."}') - - timeout = 6 - retry = 3 - if start_id + end_id > 500: # HUDOC does not let you fetch more than 500 items in one go. - for i in range(start_id, end_id, 500): - if verbose: - print(" - Fetching information from cases {} to {}.".format(i, i + 500)) - # Format URL based on the incremented index. - url = META_URL.format(start=i, length=500) - if verbose: - print(url) - - # Get the response. - r = get_r(url, timeout, retry, verbose) - if r is not None: - # Get the results list - temp_dict = r.json()['results'] - # Get every document from the results list. - for result in temp_dict: - data.append(result['columns']) - - else: - # Format URL based on start and length - url = META_URL.format(start=start_id, length=end_id) - if verbose: - print(url) - - r = get_r(url, timeout, retry, verbose) - if r is not None: - # Get the results list - temp_dict = r.json()['results'] - # Get every document from the results list. - for result in temp_dict: - data.append(result['columns']) - - if len(data) == 0: - print("Search results ended up empty") - return False - return pd.DataFrame.from_records(data) diff --git a/echr/echr_extractor/ECHR_nodes_edges_list_transform.py b/echr/echr_extractor/ECHR_nodes_edges_list_transform.py deleted file mode 100644 index 2d4bd0c..0000000 --- a/echr/echr_extractor/ECHR_nodes_edges_list_transform.py +++ /dev/null @@ -1,301 +0,0 @@ -import numpy as np -import pandas as pd -import re -import dateparser -from echr_extractor.clean_ref import clean_pattern - - - -def open_metadata(PATH_metadata): - """ - Finds the ECHR metadata file and loads it into a dataframe - - param filename_metadata: string with path to metadata - """ - try: - df = pd.read_csv(PATH_metadata) # change hard coded path - return df - except FileNotFoundError: - print("File not found. Please check the path to the metadata file.") - return False - -def concat_metadata(df): - agg_func = {'itemid' : 'first', 'appno' : 'first', 'article' : 'first', 'conclusion' : 'first' , 'docname' : 'first' , 'doctype' : 'first', - 'doctypebranch' : 'first', 'ecli' : 'first', 'importance' : 'first', 'judgementdate' : 'first', 'languageisocode' : ', '.join, 'originatingbody' : 'first', - 'violation' : 'first', 'nonviolation' : 'first', 'extractedappno' : 'first', 'scl' : 'first'} - new_df = df.groupby('ecli').agg(agg_func) - print(new_df) - return new_df - -def get_language_from_metadata(df): - df = concat_metadata(df) - df.to_json('langisocode-nodes.json', orient="records") - -def metadata_to_nodesedgeslist(df): - """ - Returns a dataframe where column 'article' only contains a certain article - - param df: the complete dataframe from the metadata - """ - - return df - - -def retrieve_nodes_list(df): - """ - Returns a dataframe where 'ecli' is moved to the first column. 
- - param df: the dataframe after article filter - """ - df = metadata_to_nodesedgeslist(df) - col = df.pop("ecli") - df.insert(1, col.name, col) - df.drop(df.columns[0], axis=1, inplace=True) - return df - - -def retrieve_edges_list(df, df_unfiltered): - """ - Returns a dataframe consisting of 2 columns 'ecli' and 'reference' which - indicate a reference link between cases. - - params: - df -- the node list extracted from the metadata - df_unfiltered -- the complete dataframe from the metadata - """ - edges = pd.DataFrame(columns=['ecli', 'references']) - - count = 0 - tot_num_refs = 0 - missing_cases = [] - for index, item in df.iterrows(): - eclis = [] - app_number = [] - extracted_appnos = [] - if item.extractedappno is not np.nan: - extracted_appnos = item.extractedappno.split(';') - - if item.scl is not np.nan: - """ - Split the references from the scl column i nto a list of references. - - Example: - references in string: "Ali v. Switzerland, 5 August 1998, § 32, Reports of Judgments and - Decisions 1998-V;Sevgi Erdogan v. Turkey (striking out), no. 28492/95, 29 April 2003" - - ["Ali v. Switzerland, 5 August 1998, § 32, Reports of Judgments and - Decisions 1998-V", "Sevgi Erdogan v. Turkey (striking out), no. - 28492/95, 29 April 2003"] - """ - ref_list = item.scl.split(';') - new_ref_list = [] - for ref in ref_list: - ref = re.sub('\n', '', ref) - new_ref_list.append(ref) - - tot_num_refs = tot_num_refs + len(ref_list) - - for ref in new_ref_list: - app_number = re.findall("[0-9]{3,5}\/[0-9]{2}", ref) ################ - if len(extracted_appnos) > 0: - app_number = app_number + extracted_appnos - # app_number = app_number + extracted_appnos - app_number = set(app_number) - - if len(app_number) > 0: - # get dataframe with all possible cases by application number - if len(app_number) > 1: - app_number = [';'.join(app_number)] - case = lookup_app_number(app_number, df_unfiltered) - else: # if no application number in reference - # get dataframe with all possible cases by casename - case = lookup_casename(ref, df_unfiltered) - - if len(case) == 0: - case = lookup_casename(ref, df_unfiltered) - - components = ref.split(',') - # get the year of case - year_from_ref = get_year_from_ref(components) - - # remove cases in different language than reference - for id, it in case.iterrows(): - if 'v.' 
in components[0]: - lang = 'ENG' - else: - lang = 'FRE' - - if lang not in it.languageisocode: - case = case[case['languageisocode'].str.contains(lang, regex=False, flags=re.IGNORECASE)] - - for id, i in case.iterrows(): - if i.judgementdate is np.nan: - continue - date = dateparser.parse(i.judgementdate) - year_from_case = date.year - - if year_from_case - year_from_ref == 0: - case = case[case['judgementdate'].str.contains(str(year_from_ref), regex=False, flags=re.IGNORECASE)] - - #case = metadata_to_nodesedgeslist(case) - - if len(case) > 0: - if len(case) > 3: - print("stop") - for _,row in case.iterrows(): - eclis.append(row.ecli) - else: - count = count + 1 - missing_cases.append(ref) - - eclis = set(eclis) - - #add ecli to edges list - if len(eclis) == 0: - continue - else: - edges = pd.concat( - [edges, pd.DataFrame.from_records([{'ecli': item.ecli, 'references': list(eclis)}])]) - - print("num missed cases: ", count) - print("total num of refs: ", tot_num_refs) - missing_cases_set = set(missing_cases) - missing_cases = list(missing_cases_set) - - # Store missing references - missing_df = pd.DataFrame(missing_cases) - # missing_df.to_csv('C:/Users/Chloe/PycharmProjects/case-law-explorer/data/echr/missing_cases.csv', index=False, encoding='utf-8') - edges = edges.groupby('ecli', as_index=False).agg({'references' : 'sum'}) - return edges - -def lookup_app_number(pattern, df): - """ - Returns a list with rows containing the cases linked to the found app numbers. - """ - row = df.loc[df['appno'].isin(pattern)] - - if row.empty: - return pd.DataFrame() - elif row.shape[0] > 1: - return row - else: - return row - - -def lookup_casename(ref, df): - """ - Process the reference for lookup in metadata. - Returns the rows corresponding to the cases. - - - Example of the processing (2 variants) - - - Original reference from scl: - - Hentrich v. France, 22 September 1994, § 42, Series A no. 296-A - - Eur. Court H.R. James and Others judgment of 21 February 1986, - Series A no. 98, p. 46, para. 81 - - Split on ',' and take first item: - Hentrich v. France - Eur. Court H.R. James and Others judgment of 21 February 1986 - - If certain pattern from CLEAN_REF in case name, then remove: - Eur. Court H.R. James and Others judgment of 21 February 1986 --> - James and Others - - Change name to upper case and add additional text to match metadata: - Hentrich v. France --> CASE OF HENTRICH V. FRANCE - James and Others --> CASE OF JAMES AND OTHERS - """ - name = get_casename(ref) - - # DEV note: In case, add more patterns to clean_ref.py in future - patterns = clean_pattern - - uptext = name.upper() - - if 'NO.' in uptext: - uptext = uptext.replace('NO.', 'No.') - - if 'BV' in uptext: - uptext = uptext.replace('BV', 'B.V.') - - if 'v.' in name: - uptext = uptext.replace('V.', 'v.') - lang = 'ENG' - else: - uptext = uptext.replace('C.', 'c.') - lang = 'FRE' - - for pattern in patterns: - uptext = re.sub(pattern, '', uptext) - - uptext = re.sub(r'\[.*', "", uptext) - uptext = uptext.strip() - row = df[df['docname'].str.contains(uptext, regex=False, flags=re.IGNORECASE)] - - # if len(row) == 0: - # print("no cases matched: ", name) - - return row - -def get_casename(ref): - count = 0 - if 'v.' in ref: - slice_at_versus = ref.split('v.') # skip if typo (count how many) - elif 'c.' 
in ref: - slice_at_versus = ref.split('c.') - else: - count = count + 1 - name = ref.split(',') - return name[0] - - num_commas = slice_at_versus[0].count(',') - - if num_commas > 0: - num_commas = num_commas + 1 - name = ",".join(ref.split(",", num_commas)[:num_commas]) - else: - name = ref.split(',') - return name[0] - return name - -def get_year_from_ref(ref): - for component in ref: - if '§' in component: - continue - component = re.sub('judgment of ', "", component) - if dateparser.parse(component) is not None: - date = dateparser.parse(component) - elif ("ECHR" in component or "CEDH" in component): - if ("ECHR" in component or "CEDH" in component): - date = re.sub('ECHR ', '', component) - date = re.sub('CEDH ', '', date) - date = date.strip() - date = re.sub('-.*', '', date) - date = re.sub('\s.*', '', date) - date = dateparser.parse(date) - - try: - return date.year - except: - return 0 - - - -def echr_nodes_edges(metadata_path): - """ - Create nodes and edges list for the ECHR data. - """ - print('\n--- COLLECTING METADATA ---\n') - data = open_metadata(metadata_path) - - print('\n--- EXTRACTING NODES LIST ---\n') - nodes = retrieve_nodes_list(data) - # get_language_from_metadata(nodes) - - print('\n--- EXTRACTING EDGES LIST ---\n') - edges = retrieve_edges_list(nodes, data) - - # nodes.to_json(JSON_ECHR_NODES, orient="records") - # edges.to_json(JSON_ECHR_EDGES, orient="records") - return nodes, edges \ No newline at end of file diff --git a/echr/echr_extractor/__init__.py b/echr/echr_extractor/__init__.py deleted file mode 100644 index a3d8f80..0000000 --- a/echr/echr_extractor/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from echr_extractor.echr import get_echr,get_echr_extra,get_nodes_edges \ No newline at end of file diff --git a/echr/echr_extractor/clean_ref.py b/echr/echr_extractor/clean_ref.py deleted file mode 100644 index 5eb8d18..0000000 --- a/echr/echr_extractor/clean_ref.py +++ /dev/null @@ -1,9 +0,0 @@ -''' -This module contains the list of patterns for reference lookup in metadata -''' - - -clean_pattern = ['EUR. COURT H.R.', - 'JUDGMENT OF.*', - ' DU.*' - ] \ No newline at end of file diff --git a/echr/echr_extractor/echr.py b/echr/echr_extractor/echr.py deleted file mode 100644 index 1ed5987..0000000 --- a/echr/echr_extractor/echr.py +++ /dev/null @@ -1,96 +0,0 @@ -from echr_extractor.ECHR_metadata_harvester import get_echr_metadata -from echr_extractor.ECHR_html_downloader import download_full_text_main -from echr_extractor.ECHR_nodes_edges_list_transform import echr_nodes_edges -from pathlib import Path -import os -import json - -""" -I have replaced the function definition to take all arguments n eede to call read_echr_metadata and I have also -replaced the file naming lines. The old code is commented out I didn't delete anything. :) -On top of this the lines which deal with defining default values have been commented out because this is -handled in read_echr_metadata. It can also be done here but then it should be removed from the other method. -It will probably be necissary to add some file handling to prevent overwriting. I'm not sure if you have plans for this -already but feel free to shoot me a message about it seeing as I did something similar for the ECHR branch. 
@Benjamin -""" - - -def get_echr(start_id=0, end_id=None, start_date=None, count=None, end_date=None, verbose=False, save_file='y', - fields=None, link=None, language=None): - if language is None: - language = ["ENG"] - if count: - end_id = int(start_id) + count - print(f"--- STARTING ECHR DOWNLOAD FOR ---") - df = get_echr_metadata(start_id=start_id, end_id=end_id, start_date=start_date, end_date=end_date, - verbose=verbose, fields=fields, link=link, language=language) - if df is False: - return False - if save_file == "y": - filename = determine_filename(start_id, end_id, start_date, end_date) - Path('data').mkdir(parents=True, exist_ok=True) - file_path = os.path.join('data', filename + '.csv') - df.to_csv(file_path, index=False) - print("\n--- DONE ---") - return df - else: - print("\n--- DONE ---") - return df - - -def determine_filename(start_id, end_id, start_date, end_date): - if end_id: - if start_date and end_date: - filename = f"echr_metadata_index_{start_id}-{end_id}_dates_{start_date}-{end_date}" - elif start_date: - filename = f"echr_metadata_{start_id}-{end_id}_dates_{start_date}-END" - elif end_date: - filename = f"echr_metadata_{start_id}-{end_id}_datesSTART-{end_date}" - else: - filename = f"echr_metadata_{start_id}-{end_id}_dates_START-END" - else: - if start_date and end_date: - filename = f"echr_metadata_index_{start_id}-ALL_dates_{start_date}-{end_date}" - elif start_date: - filename = f"echr_metadata_{start_id}-ALL_dates_{start_date}-END" - elif end_date: - filename = f"echr_metadata_{start_id}-ALL_dates_START-{end_date}" - else: - filename = f"echr_metadata_{start_id}-ALL_dates_START-END" - return filename - - -def get_echr_extra(start_id=0, end_id=None, start_date=None, count=None, end_date=None, verbose=False, - save_file='y', threads=10, fields=None, link=None, language=None): - df = get_echr(start_id=start_id, end_id=end_id, start_date=start_date, end_date=end_date, verbose=verbose, - count=count, save_file='n', fields=fields, link=link, language=language) - print("Full-text download will now begin") - if df is False: - return False, False - json_list = download_full_text_main(df, threads) - print("Full-text download finished") - if save_file == "y": - filename = determine_filename(start_id, end_id, start_date, end_date) - filename_json = filename.replace("metadata", "full_text") - Path('data').mkdir(parents=True, exist_ok=True) - file_path = os.path.join('data', filename + '.csv') - df.to_csv(file_path, index=False) - file_path_json = os.path.join('data', filename_json + '.json') - with open(file_path_json, "w") as f: - json.dump(json_list, f) - return df, json_list - else: - return df, json_list - - -def get_nodes_edges(metadata_path, save_file='y'): - nodes, edges = echr_nodes_edges(metadata_path) - if save_file == "y": - Path('data').mkdir(parents=True, exist_ok=True) - edges.to_csv(os.path.join('data', 'ECHR_edges.csv'), index=False, encoding='utf-8') - nodes.to_csv(os.path.join('data', 'ECHR_nodes.csv'), index=False, encoding='utf-8') - nodes.to_json(os.path.join('data', 'ECHR_nodes.json'), orient="records") - edges.to_json(os.path.join('data', 'ECHR_edges.json'), orient="records") - return nodes, edges - - return nodes, edges diff --git a/echr/echr_extractor/testing_file.py b/echr/echr_extractor/testing_file.py deleted file mode 100644 index 29ff259..0000000 --- a/echr/echr_extractor/testing_file.py +++ /dev/null @@ -1,32 +0,0 @@ -import os,sys -from os.path import dirname, abspath -from pathlib import Path,PurePath - -current_dir = (abspath(__file__)) 
-correct_dir = '\\'.join(current_dir.replace('\\', '/').split('/')[:-2]) -sys.path.append(correct_dir) -# print(sys.path) - - -from echr_extractor.echr import get_echr_extra, get_echr, get_nodes_edges -import dateutil.parser - -import datetime -if __name__ == '__main__': - df = get_echr_extra(count=100,save_file='y',language=["FRE","ENG"],start_date='2023-01-01') - - - - - """ - Start and end dates must be date objects, which can be achieved by calling dateutil.parser.parse(some date string).date(). - I assume you dont want to do that in this file but im not sure where this conversion is most appropriate so I'll leave it up to you. - Note that there is an extra import because of this. - I have commented out some of your stuff to test this, if you run it as is it should work. @Benjamin - """ - print(str(datetime.datetime.today().date())) - #df = get_echr_extra(count=100,threads=5,start_date='2000-01-01',end_date='2023-01-01') - #df,json = get_echr_extra(start_id=20,end_id=3000,save_file='n') - - #df = get_echr(start_id=1000,count=2000,save_file='n') - diff --git a/echr/setup.py b/echr/setup.py deleted file mode 100644 index 8c5805a..0000000 --- a/echr/setup.py +++ /dev/null @@ -1,26 +0,0 @@ -# This file is required to create a python library - -from setuptools import find_packages, setup -from pathlib import Path - -p = Path("README.md") -long_descr = p.read_text() - -setup( - name='echr_extractor', - packages=find_packages(include=['echr_extractor']), - version='1.0.21', - description='Library for extracting ECHR data', - author='LawTech Lab', - license='MIT', - install_requires=["requests~=2.26.0","pandas~=1.2.5","beautifulsoup4~=4.9.3", "dateparser"], - author_email='a.gade@student.maastrichtuniversity.nl', - keywords=['echr', 'extractor', 'european', 'convention', 'human', 'rights', 'european convention', 'human rights', - 'european convention on human rights'], - long_description=long_descr, - long_description_content_type='text/markdown', - project_urls={ - "Bug Tracker": "https://github.com/maastrichtlawtech/extraction_libraries", - "Build Source": "https://github.com/maastrichtlawtech/extraction_libraries", - }, -) \ No newline at end of file diff --git a/rechtspraak/README.md b/rechtspraak/README.md deleted file mode 100644 index a3904cf..0000000 --- a/rechtspraak/README.md +++ /dev/null @@ -1,159 +0,0 @@ -## Rechtspraak extractor -This library contains two functions to get rechtspraak data and metadata from the API. - -## Version -Python 3.9 - -## Contributors - - - - - - - - - - -
    - pranavnbapat (Pranav Bapat)
    - running-machin
    - Cloud956 (Piotr Lewandowski)
    - shashankmc
    - gijsvd
    - - -## How to install? -pip install rechtspraak_extractor - -## What are the functions? -
  • Rechtspraak Extractor -
      -
    1. get_rechtspraak
    2. - Gets all the ECLIs and saves them to a CSV file or returns them in memory. 
      It retrieves the ECLI, title, summary, updated date, and link. 
    3. get_rechtspraak_metadata
    4. - Gets the metadata of the ECLIs retrieved by the above function and saves it to a new CSV file or returns it in memory. 
      The link attribute returned by the above function contains the links to the ECLI metadata. 
      It retrieves instantie, datum uitspraak, datum publicatie, zaaknummer, rechtsgebieden, bijzondere kenmerken, inhoudsindicatie, and vindplaatsen. 
  • - -## What are the parameters? -
      -
    1. get_rechtspraak(max_ecli=100, sd='2022-05-01', ed='2022-10-01', save_file='y')
    2. - Parameters: max_ecli (maximum number of ECLIs to retrieve), sd (start date of publication, yyyy-mm-dd), ed (optional end date, defaults to the current date), save_file ('y' saves a CSV file in the data folder, 'n' returns a dataframe). 
    3. get_rechtspraak_metadata
    4. - Parameters: save_file ('y' or 'n'), plus either dataframe (a dataframe created by get_rechtspraak) or filename (a CSV file in the data folder created by get_rechtspraak). 
    - - -## Examples -``` -import rechtspraak_extractor as rex - ------------------------------------------------------------------------------------------------------------------------ - -# For rechtspraak - -# To get the rechtspraak data in a dataframe: -df = rex.get_rechtspraak(max_ecli=100, sd='2022-08-01', save_file='n') # Gets 100 ECLIs from 1st August 2022 - -# To save rechtspraak data as a CSV file: -rex.get_rechtspraak(max_ecli=100, sd='2022-08-01', save_file='y') - ------------------------------------------------------------------------------------------------------------------------ - -# For rechtspraak metadata - -# To get metadata as a dataframe from rechtspraak data (as a dataframe): -df_metadata = rex.get_rechtspraak_metadata(save_file='n', dataframe=df) - -# To get metadata as a dataframe from rechtspraak file (as a dataframe): -df_metadata = rex.get_rechtspraak_metadata(save_file='n', filename='rechtspraak.csv') - -# To get metadata as a dataframe from rechtspraak data (saved as CSV file): -rex.get_rechtspraak_metadata(save_file='y', dataframe=df) - -# To get metadata and save as a CSV file: -rex.get_rechtspraak_metadata(save_file='y', filename='rechtspraak.csv') - ------------------------------------------------------------------------------------------------------------------------ - -# filename='rechtspraak.csv' - filename.csv is a file from the data folder created by get_rechtspraak method -# dataframe=df - df is a dataframe created by get_rechtspraak method - -# Will not get any metadata -df = rex.get_rechtspraak_metadata(save_file='n') - -# Will get the metadata of all the files in the data folder -rex.get_rechtspraak_metadata(save_file='y') -``` - - -## License -[![License: Apache 2.0](https://img.shields.io/github/license/maastrichtlawtech/extraction_libraries)](https://opensource.org/licenses/Apache-2.0) - -Previously under the [MIT License](https://opensource.org/licenses/MIT), as of 28/10/2022 this work is licensed under a [Apache License, Version 2.0](https://opensource.org/licenses/Apache-2.0). -``` -Apache License, Version 2.0 - -Copyright (c) 2022 Maastricht Law & Tech Lab - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -``` diff --git a/rechtspraak/rechtspraak_extractor/__init__.py b/rechtspraak/rechtspraak_extractor/__init__.py deleted file mode 100644 index 13f583b..0000000 --- a/rechtspraak/rechtspraak_extractor/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# from rechtspraak_extractor import rechtspraak -# from rechtspraak_extractor import rechtspraak_metadata -from rechtspraak_extractor.rechtspraak import get_rechtspraak -from rechtspraak_extractor.rechtspraak_metadata import get_rechtspraak_metadata diff --git a/rechtspraak/rechtspraak_extractor/rechtspraak.py b/rechtspraak/rechtspraak_extractor/rechtspraak.py deleted file mode 100644 index aaac035..0000000 --- a/rechtspraak/rechtspraak_extractor/rechtspraak.py +++ /dev/null @@ -1,117 +0,0 @@ -# This file is used to get all the Rechtspraak ECLIs from an API. -# It takes two required arguments and one optional argument -# 1. 
max - Maximum number of ECLIs to retrieve -# 2. starting-date (yyyy-mm-dd) - Start date of ECLI publication -# 3. ending-date (yyyy-mm-dd) - It's an optional parameter. If not given, current date will be automatically chosen -# File is stored in data/rechtspraak folder - -import json -import xmltodict -import os -from datetime import date, datetime -from rechtspraak_extractor.rechtspraak_functions import * - - -# Define base URL -RECHTSPRAAK_API_BASE_URL = "https://data.rechtspraak.nl/uitspraken/zoeken?" - - -def get_data_from_url(url): - res = requests.get(url) - res.raw.decode_content = True - - # Convert the XML data to JSON format - xpars = xmltodict.parse(res.text) - json_string = json.dumps(xpars) - json_object = json.loads(json_string) - - # Get the JSON object from a specific branch - json_object = json_object['feed']['entry'] - - return json_object - - -def save_csv(json_object, file_name, save_file): - # Define the dataframe to enter the data - df = pd.DataFrame(columns=['id', 'title', 'summary', 'updated', 'link']) - ecli_id = [] - title = [] - summary = [] - updated = [] - link = [] - - # Iterate over the object and fill the lists - for i in json_object: - ecli_id.append(i['id']) - title.append(i['title']['#text']) - if '#text' in i['summary']: - summary.append(i['summary']['#text']) - else: - summary.append("No summary available") - updated.append(i['updated']) - link.append(i['link']['@href']) - - # Save the lists to dataframe - df['id'] = ecli_id - df['title'] = title - df['summary'] = summary - df['updated'] = updated - df['link'] = link - - if save_file == 'y': - # Create directory if not exists - Path('data').mkdir(parents=True, exist_ok=True) - - # Save CSV file - # file_path = os.path.join('data', file_name + '.csv') - df.to_csv('data/' + file_name + '.csv', index=False, encoding='utf8') - print("Data saved to CSV file successfully.") - return df - -def get_rechtspraak(max_ecli=100, sd='1900-01-01', ed=None, save_file='y'): - print("Rechtspraak dump downloader API") - - amount = max_ecli - starting_date = sd - save_file = save_file - - # If the end date is not entered, the current date is taken - today = date.today() - if ed: - ending_date = ed - else: - ending_date = today.strftime("%Y-%m-%d") - - # Used to calculate total execution time - start_time = time.time() - - # Build the URL after getting all the arguments - url = RECHTSPRAAK_API_BASE_URL + 'max=' + str(amount) + '&date=' + starting_date + '&date=' + ending_date - - print("Checking the API") - # Check the working of API - response_code = check_api(url) - if response_code == 200: - print("API is working fine!") - print("Getting " + str(amount) + " documents from " + starting_date + " till " + ending_date) - - json_object = get_data_from_url(url) - print(f"Found {len(json_object)} cases!") - if json_object: - # Get current time - current_time = datetime.now().strftime("%H-%M-%S") - - # Build file name - file_name = 'rechtspraak_' + starting_date + '_' + ending_date + '_' + current_time - - - get_exe_time(start_time) - - if save_file == 'n': - global_rs_df = save_csv(json_object, file_name, save_file) - return global_rs_df - else: - save_csv(json_object, file_name, save_file) - return - else: - print(f"URL returned with a {response_code} error code") diff --git a/rechtspraak/rechtspraak_extractor/rechtspraak_functions.py b/rechtspraak/rechtspraak_extractor/rechtspraak_functions.py deleted file mode 100644 index 02fa28b..0000000 --- a/rechtspraak/rechtspraak_extractor/rechtspraak_functions.py +++ /dev/null @@ 
-1,44 +0,0 @@ -import requests, glob, time -from pathlib import Path -import pandas as pd - - -# Check whether the API is working or not and return with the response code -def check_api(url): - response = requests.get(f"{url}") - - # Return with the response code - return response.status_code - - -# Reads all the CSV files in a folder and returns the list of files -# It also has an optional parameter "exclude". By default, it's None. If you want to exclude files having a certain -# word in the file name, you may give a value -# It also only grabs data if it has rechtspraak in it -# As that was causing issues with other csv data present -def read_csv(dir_name, exclude=None): - path = dir_name - csv_files = glob.glob(path + "/*.csv") - files = [] - for i in csv_files: - if exclude is not None: - if exclude not in i and "rechtspraak" in i: - files.append(i) - else: - if "rechtspraak" in i: - files.append(i) - - print("Found " + str(len(files)) + " CSV file(s)\n") - return files - - -# Get total execution time -def get_exe_time(start_time): - end_time = time.time() - sec = end_time - start_time - mins = sec // 60 - sec = sec % 60 - hours = mins // 60 - mins = mins % 60 - print("Total execution time: {0}:{1}:{2}".format(int(hours), int(mins), round(sec, 2))) - print("\n") diff --git a/rechtspraak/rechtspraak_extractor/rechtspraak_metadata.py b/rechtspraak/rechtspraak_extractor/rechtspraak_metadata.py deleted file mode 100644 index c315359..0000000 --- a/rechtspraak/rechtspraak_extractor/rechtspraak_metadata.py +++ /dev/null @@ -1,387 +0,0 @@ -# This file is used for getting the metadata of the ECLIs obtained using rechspraak_api file. This file takes all the -# CSV file created by rechspraak_api, picks up ECLIs and links column, and using an API gets the metadata and saves it -# in another CSV file with metadata suffix. -# This happens in async manner. -import pathlib -import os -import urllib -import multiprocessing -from bs4 import BeautifulSoup -from datetime import datetime -from concurrent.futures import ThreadPoolExecutor -import platform -import shutil -from tqdm import tqdm -from rechtspraak_extractor.rechtspraak_functions import * -from functools import partial -# Define base url -RECHTSPRAAK_METADATA_API_BASE_URL = "http://data.rechtspraak.nl/uitspraken/content?id=" # old one = "https://uitspraken.rechtspraak.nl/#!/details?id=" -return_type = "&return=DOC" - -# Define empty lists where we'll store our data temporarily -ecli_df = [] -full_text_df = [] -creator_df = [] -date_decision_df = [] -issued_df = [] -zaaknummer_df = [] -type_df = [] -relations_df = [] -references_df = [] -subject_df = [] -procedure_df = [] -inhoudsindicatie_df = [] -hasVersion_df = [] - -threads = [] -max_workers = 0 - - -def get_cores(): - # max_workers is the number of concurrent processes supported by your CPU multiplied by 5. - # You can change it as per the computing power. - # Different python versions treat this differently. This is written as per python 3.6. - n_cores = multiprocessing.cpu_count() - - global max_workers - max_workers = n_cores-1 - # If the main process is computationally intensive: Set to the number of logical CPU cores minus one. 
- - print(f"Maximum " + str(max_workers) + " threads supported by your machine.") - - -def extract_data_from_xml(url): - with urllib.request.urlopen(url) as response: - xml_file = response.read() - return xml_file - - - -def check_if_df_empty(df): - if df.empty: - return True - return False - - -def get_text_if_exists(el): - try: - return el.text - except: - return '' - -def update_bar(bar, *args): - bar.update(1) - - -def save_data_when_crashed(ecli): - ecli_df.append(ecli) - full_text_df.append("") - creator_df.append("") - date_decision_df.append("") - issued_df.append("") - zaaknummer_df.append("") - type_df.append("") - relations_df.append("") - references_df.append("") - subject_df.append("") - procedure_df.append("") - inhoudsindicatie_df.append("") - hasVersion_df.append("") -def get_data_from_api(ecli_id): - url = RECHTSPRAAK_METADATA_API_BASE_URL + ecli_id + return_type - try: - response_code = check_api(url) - except: - save_data_when_crashed(ecli_id) - return - global ecli_df, full_text_df, creator_df, date_decision_df, issued_df, zaaknummer_df, type_df, \ - relations_df, references_df, subject_df, procedure_df, inhoudsindicatie_df, hasVersion_df - try: - if response_code == 200: - try: - # Extract data from xml - xml_object = extract_data_from_xml(url) - soup = BeautifulSoup(xml_object, features='xml') - # Get the data - creator = get_text_if_exists(soup.find("dcterms:creator")) - date_decision = get_text_if_exists(soup.find("dcterms:date")) - issued = get_text_if_exists(soup.find("dcterms:issued")) - zaaknummer = get_text_if_exists(soup.find("psi:zaaknummer")) - rs_type = get_text_if_exists(soup.find("dcterms:type")) - subject = get_text_if_exists(soup.find("dcterms:subject")) - relation = soup.findAll("dcterms:relation") - relatie = '' - for i in relation: - # append the string to relation - text = get_text_if_exists(i) - if text == '': - continue - else: - relatie += text + "\n" - relations = relatie - reference = soup.findAll("dcterms:references") - ref = '' - for u in reference: - text = get_text_if_exists(u) - # append the string to relation - if text =="": - continue - else: - ref += text + "\n" - references = ref - procedure = get_text_if_exists(soup.find("psi:procedure")) - inhoudsindicatie = get_text_if_exists(soup.find("inhoudsindicatie")) - hasVersion = get_text_if_exists(soup.find("dcterms:hasVersion")) - full_text = get_text_if_exists(soup.find("uitspraak")) - - ecli_df.append(ecli_id) - full_text_df.append(full_text) - creator_df.append(creator) - date_decision_df.append(date_decision) - issued_df.append(issued) - zaaknummer_df.append(zaaknummer) - type_df.append(rs_type) - relations_df.append(relations) - references_df.append(references) - subject_df.append(subject) - procedure_df.append(procedure) - inhoudsindicatie_df.append(inhoudsindicatie) - hasVersion_df.append(hasVersion) - del full_text, creator, date_decision, issued, zaaknummer,relations, rs_type,\ - references, subject,procedure, inhoudsindicatie, hasVersion - - urllib.request.urlcleanup() - - except Exception as e: - save_data_when_crashed(ecli_id) - else: - save_data_when_crashed(ecli_id) - except Exception as e: - save_data_when_crashed(ecli_id) - - -def get_rechtspraak_metadata(save_file='n', dataframe=None, filename=None): - if dataframe is not None and filename is not None: - print(f"Please provide either a dataframe or a filename, but not both") - return False - - if dataframe is None and filename is None and save_file == 'n': - print(f"Please provide at least a dataframe of filename when the 
save_file is \"n\"") - return False - - print("Rechtspraak metadata API") - - start_time = time.time() # Get start time - - no_of_rows = '' - rs_data = '' - csv_files = 0 - - # Check if dataframe is provided and is correct - if dataframe is not None: - if 'id' in dataframe and 'link' in dataframe: - rs_data = dataframe - no_of_rows = rs_data.shape[0] - else: - print("Dataframe is corrupted or does not contain necessary information to get the metadata.") - return False - - # Check if filename is provided and is correct - if filename is not None: - print("Reading " + filename + " from data folder") - file_check = pathlib.Path("data/" + filename) - if file_check.is_file(): - print("File found. Checking if metadata already exists") - # Check if metadata already exists - file_check = Path("data/" + filename.split('/')[-1][:len(filename.split('/')[-1]) - 4] - + "_metadata.csv") - if file_check.is_file(): - print("Metadata for " + filename.split('/')[-1][:len(filename.split('/')[-1]) - 4] + - ".csv already exists.") - return False - else: - rs_data = pd.read_csv('data/' + filename) - if 'id' in rs_data and 'link' in rs_data: - no_of_rows = rs_data.shape[0] - else: - print("File is corrupted or does not contain necessary information to get the metadata.") - return False - else: - print("File not found. Please check the file name.") - return False - - get_cores() # Get number of cores supported by the CPU - - if dataframe is None and filename is None and save_file == 'y': - print("No dataframe or file name is provided. Getting the metadata of all the files present in the " - "data folder") - - print("Reading all CSV files in the data folder...") - csv_files = read_csv('data', "metadata") - - global ecli_df, full_text_df, creator_df, date_decision_df, issued_df, zaaknummer_df, \ - type_df, relations_df,references_df, subject_df,\ - procedure_df, inhoudsindicatie_df, hasVersion_df - if len(csv_files) > 0 and save_file == 'y': - for f in csv_files: - # Create empty dataframe - rsm_df = pd.DataFrame(columns=['ecli', 'full_text', 'creator', 'date_decision', - 'issued', 'zaaknummer','type',"relations", - 'references','subject','procedure', - 'inhoudsindicatie', 'hasVersion']) - - temp_file_name = f.split('\\')[-1][:len(f.split('\\')[-1]) - 4] - - # Check if file already exists - file_check = Path("data/" + temp_file_name + "_metadata.csv") - if file_check.is_file(): - print("Metadata for " + temp_file_name + ".csv already exists.") - continue - - df = pd.read_csv(f) - no_of_rows = df.shape[0] - print("Getting metadata of " + str(no_of_rows) + " ECLIs from " + temp_file_name + ".csv") - print("Working. 
Please wait...") - - # Get all ECLIs in a list - ecli_list = list(df.loc[:, 'id']) - - # Create a temporary directory to save files - time.sleep(1) - Path('temp_rs_data').mkdir(parents=True, exist_ok=True) - with ThreadPoolExecutor(max_workers=max_workers) as executor: - for ecli in ecli_list: - threads.append(executor.submit(get_data_from_api, ecli)) - - # Delete temporary directory - shutil.rmtree('temp_rs_data') - # executor.shutdown() # Shutdown the executor - - rsm_df['ecli'] = ecli_df - rsm_df['full_text'] = full_text_df - rsm_df['creator'] = creator_df - rsm_df['date_decision'] = date_decision_df - rsm_df['issued'] = issued_df - rsm_df['zaaknummer'] = zaaknummer_df - rsm_df['type'] = type_df - rsm_df['relations'] = relations_df - rsm_df['references'] = references_df - rsm_df['subject'] = subject_df - rsm_df['procedure'] = procedure_df - rsm_df['inhoudsindicatie'] = inhoudsindicatie_df - rsm_df['hasVersion'] = hasVersion_df - addition = rs_data[['id', 'summary']] - rsm_df = rsm_df.merge(addition, how='left', left_on='ecli', right_on='id').drop(['id'], axis=1) - # Create directory if not exists - Path('data').mkdir(parents=True, exist_ok=True) - - if check_if_df_empty(rsm_df): - print("Metadata not found. Please check the API response; either API is under maintenance, " - "experiencing problems, or has changed. Please try again after some time or contact the " - "administrator.\n") - else: - # Save CSV file - print("Creating CSV file...") - rsm_df.to_csv("data/" + temp_file_name + "_metadata.csv", index=False, encoding='utf8') - print("CSV file " + temp_file_name + "_metadata.csv successfully created.\n") - - # Clear the lists for the next file - ecli_df = [] - full_text_df = [] - creator_df = [] - date_decision_df = [] - issued_df = [] - zaaknummer_df = [] - type_df = [] - relations_df = [] - references_df = [] - subject_df = [] - procedure_df = [] - inhoudsindicatie_df = [] - hasVersion_df = [] - ecli_list = [] - del rsm_df - return True - - if rs_data is not None: - rsm_df = pd.DataFrame(columns=['ecli', 'full_text', 'creator', 'date_decision', 'issued', - 'zaaknummer','type','relations','references', 'subject', 'procedure', - 'inhoudsindicatie','hasVersion']) - - print("Getting metadata of " + str(no_of_rows) + " ECLIs") - print("Working. Please wait...") - # Get all ECLIs in a list - ecli_list = list(rs_data.loc[:, 'id']) - - # Create a temporary directory to save files - Path('temp_rs_data').mkdir(parents=True, exist_ok=True) - time.sleep(1) - with ThreadPoolExecutor(max_workers=max_workers) as executor: - bar = tqdm(total=len(ecli_list), colour="GREEN",position=0, leave=True, miniters=int(len(ecli_list)/100), - maxinterval=10000) - for ecli in ecli_list: - threads.append(executor.submit(get_data_from_api, ecli)) - for t in threads: - t.add_done_callback(partial(update_bar,bar)) - # Delete temporary directory - shutil.rmtree('temp_rs_data') - # to finish unfinished? 
- # global ecli_df, full_text_df, creator_df, date_decision_df, issued_df, zaaknummer_df, \ - # relations_df, subject_df, procedure_df, inhoudsindicatie_df, hasVersion_df - - rsm_df['ecli'] = ecli_df - rsm_df['full_text'] = full_text_df - rsm_df['creator'] = creator_df - rsm_df['date_decision'] = date_decision_df - rsm_df['issued'] = issued_df - rsm_df['zaaknummer'] = zaaknummer_df - rsm_df['type'] = type_df - rsm_df['relations'] = relations_df - rsm_df['references'] = references_df - rsm_df['subject'] = subject_df - rsm_df['procedure'] = procedure_df - rsm_df['inhoudsindicatie'] = inhoudsindicatie_df - rsm_df['hasVersion'] = hasVersion_df - addition = rs_data[['id','summary']] - rsm_df = rsm_df.merge(addition, how='left', left_on='ecli', right_on='id').drop(['id'], axis=1) - if save_file == 'y': - if filename is None or filename == '': - filename = "custom_rechtspraak_" + datetime.now().strftime("%H-%M-%S") + ".csv" - # Create directory if not exists - Path('data').mkdir(parents=True, exist_ok=True) - - if check_if_df_empty(rsm_df): - print("Metadata not found. Please check the API response; either API is under maintenance, " - "experiencing problems, or has changed. Please try again after some time or contact the " - "administrator.\n") - else: - # Save CSV file - print("Creating CSV file...") - rsm_df.to_csv("data/" + filename.split('/')[-1][:len(filename.split('/')[-1]) - 4] + "_metadata.csv", - index=False, encoding='utf8') - print("CSV file " + filename.split('/')[-1][:len(filename.split('/')[-1]) - 4] + "_metadata.csv" + - " successfully created.\n") - - # Clear the lists for the next file - ecli_df = [] - full_text_df = [] - creator_df = [] - date_decision_df = [] - issued_df = [] - zaaknummer_df = [] - type_df = [] - relations_df = [] - references_df = [] - subject_df = [] - procedure_df = [] - inhoudsindicatie_df = [] - hasVersion_df = [] - ecli_list = [] - - get_exe_time(start_time) - - if save_file == 'n': - return rsm_df - - return True - diff --git a/rechtspraak/rechtspraak_extractor/testing_file.py b/rechtspraak/rechtspraak_extractor/testing_file.py deleted file mode 100644 index 63b7fd6..0000000 --- a/rechtspraak/rechtspraak_extractor/testing_file.py +++ /dev/null @@ -1,6 +0,0 @@ -from rechtspraak import * -from rechtspraak_metadata import * -df = get_rechtspraak(ed='1995-01-01',save_file='n',max_ecli=1000000) -df_2 = get_rechtspraak_metadata(save_file='n',dataframe=df) -b=2 -pass \ No newline at end of file diff --git a/rechtspraak/rechtspraak_extractor/tests/__init__.py b/rechtspraak/rechtspraak_extractor/tests/__init__.py deleted file mode 100644 index 84b56fc..0000000 --- a/rechtspraak/rechtspraak_extractor/tests/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -import sys -import pathlib - -from rechtspraak import get_rechtspraak -from rechtspraak_metadata import get_rechtspraak_metadata - -df = get_rechtspraak(max_ecli=50, sd='2022-08-01', save_file='y') - -# df = get_rechtspraak_metadata(save_file='n') - -print(df.head()) -print(df.shape) \ No newline at end of file diff --git a/rechtspraak/rechtspraak_extractor/tests/rechtspraak.py b/rechtspraak/rechtspraak_extractor/tests/rechtspraak.py deleted file mode 100644 index 3bd72c4..0000000 --- a/rechtspraak/rechtspraak_extractor/tests/rechtspraak.py +++ /dev/null @@ -1,140 +0,0 @@ -# This file is used to get all the Rechtspraak ECLIs from an API. -# It takes two required arguments and one optional argument -# 1. max - Maximum number of ECLIs to retrieve -# 2. starting-date (yyyy-mm-dd) - Start date of ECLI publication -# 3. 
ending-date (yyyy-mm-dd) - It's an optional parameter. If not given, current date will be automatically chosen -# File is stored in data/rechtspraak folder - -import json -import xmltodict -import os -from datetime import date, datetime -from rechtspraak_extractor.rechtspraak_functions import * - - -# Define base URL -RECHTSPRAAK_API_BASE_URL = "https://data.rechtspraak.nl/uitspraken/zoeken?" - -rs_ecli_df = [] -rs_title_df = [] -rs_summary_df = [] -rs_updated_df = [] -rs_link_df = [] - - -def get_data_from_url(url): - res = requests.get(url) - res.raw.decode_content = True - - # Convert the XML data to JSON format - xpars = xmltodict.parse(res.text) - json_string = json.dumps(xpars) - json_object = json.loads(json_string) - - # Get the JSON object from a specific branch - json_object = json_object['feed']['entry'] - - return json_object - - -def save_csv(json_object, file_name, save_file): - # Define the dataframe to enter the data - df = pd.DataFrame(columns=['id', 'title', 'summary', 'updated', 'link']) - ecli_id = [] - title = [] - summary = [] - updated = [] - link = [] - - # Iterate over the object and fill the lists - for i in json_object: - ecli_id.append(i['id']) - title.append(i['title']['#text']) - if '#text' in i['summary']: - summary.append(i['summary']['#text']) - else: - summary.append("No summary available") - updated.append(i['updated']) - link.append(i['link']['@href']) - - # Save the lists to dataframe - df['id'] = ecli_id - df['title'] = title - df['summary'] = summary - df['updated'] = updated - df['link'] = link - - if save_file == 'y': - # Create directory if not exists - Path('data').mkdir(parents=True, exist_ok=True) - - # Save CSV file - # file_path = os.path.join('data', file_name + '.csv') - df.to_csv('data/' + file_name + '.csv', index=False, encoding='utf8') - print("Data saved to CSV file successfully.") - else: - rs_ecli_df.extend(ecli_id) - rs_title_df.extend(title) - rs_summary_df.extend(summary) - rs_updated_df.extend(updated) - rs_link_df.extend(link) - - -def get_rechtspraak(max_ecli=100, sd='2022-08-01', ed=None, save_file='y'): - print("Rechtspraak dump downloader API") - - amount = max_ecli - starting_date = sd - save_file = save_file - - # If the end date is not entered, the current date is taken - today = date.today() - if ed: - ending_date = ed - else: - ending_date = today.strftime("%Y-%m-%d") - - # Used to calculate total execution time - start_time = time.time() - - # Build the URL after getting all the arguments - url = RECHTSPRAAK_API_BASE_URL + 'max=' + str(amount) + '&date=' + starting_date + '&date=' + ending_date - - print("Checking the API") - # Check the working of API - response_code = check_api(url) - if response_code == 200: - print("API is working fine!") - print("Getting " + str(amount) + " documents from " + starting_date + " till " + ending_date) - - json_object = get_data_from_url(url) - - if json_object: - # Get current time - current_time = datetime.now().strftime("%H-%M-%S") - - # Build file name - file_name = 'rechtspraak_' + starting_date + '_' + ending_date + '_' + current_time - - save_csv(json_object, file_name, save_file) - get_exe_time(start_time) - - if save_file == 'n': - global rs_ecli_df, rs_title_df, rs_summary_df, rs_updated_df, rs_link_df - global_rs_df = pd.DataFrame(columns=['id', 'title', 'summary', 'updated', 'link']) - global_rs_df['id'] = rs_ecli_df - global_rs_df['title'] = rs_title_df - global_rs_df['summary'] = rs_summary_df - global_rs_df['updated'] = rs_updated_df - global_rs_df['link'] = rs_link_df - 
print("Done") - - # Clear the lists for the next usage - rs_ecli_df = [] - rs_title_df = [] - rs_summary_df = [] - rs_updated_df = [] - rs_link_df = [] - return global_rs_df - else: - print(f"URL returned with a {response_code} error code") diff --git a/rechtspraak/rechtspraak_extractor/tests/rechtspraak_functions.py b/rechtspraak/rechtspraak_extractor/tests/rechtspraak_functions.py deleted file mode 100644 index aa84043..0000000 --- a/rechtspraak/rechtspraak_extractor/tests/rechtspraak_functions.py +++ /dev/null @@ -1,41 +0,0 @@ -import requests, glob, time -from pathlib import Path -import pandas as pd - - -# Check whether the API is working or not and return with the response code -def check_api(url): - response = requests.get(f"{url}") - - # Return with the response code - return response.status_code - - -# Reads all the CSV files in a folder and returns the list of files -# It also has an optional parameter "exclude". By default, it's None. If you want to exclude files having a certain -# word in the file name, you may give a value -def read_csv(dir_name, exclude=None): - path = dir_name - csv_files = glob.glob(path + "/*.csv") - files = [] - for i in csv_files: - if exclude is not None: - if exclude not in i: - files.append(i) - else: - files.append(i) - - print("Found " + str(len(files)) + " CSV file(s)\n") - return files - - -# Get total execution time -def get_exe_time(start_time): - end_time = time.time() - sec = end_time - start_time - mins = sec // 60 - sec = sec % 60 - hours = mins // 60 - mins = mins % 60 - print("Total execution time: {0}:{1}:{2}".format(int(hours), int(mins), round(sec, 2))) - print("\n") diff --git a/rechtspraak/rechtspraak_extractor/tests/rechtspraak_metadata.py b/rechtspraak/rechtspraak_extractor/tests/rechtspraak_metadata.py deleted file mode 100644 index 2cf9cbd..0000000 --- a/rechtspraak/rechtspraak_extractor/tests/rechtspraak_metadata.py +++ /dev/null @@ -1,319 +0,0 @@ -# This file is used for getting the metadata of the ECLIs obtained using rechspraak_api file. This file takes all the -# CSV file created by rechspraak_api, picks up ECLIs and links column, and using an API gets the metadata and saves it -# in another CSV file with metadata suffix. -# This happens in async manner. -import pathlib -import os -import urllib -import multiprocessing -from bs4 import BeautifulSoup -from datetime import datetime -from concurrent.futures import ThreadPoolExecutor -import platform -import shutil - -from rechtspraak_extractor.rechtspraak_functions import * - -# Define base url -RECHTSPRAAK_METADATA_API_BASE_URL = "https://uitspraken.rechtspraak.nl/InzienDocument?id=" - -# Define empty lists where we'll store our data temporarily -ecli_df = [] -uitspraak_df = [] -instantie_df = [] -datum_uitspraak_df = [] -datum_publicatie_df = [] -zaaknummer_df = [] -rechtsgebieden_df = [] -bijzondere_kenmerken_df = [] -inhoudsindicatie_df = [] -vindplaatsen_df = [] - -threads = [] -max_workers = 0 - - -def get_cores(): - # max_workers is the number of concurrent processes supported by your CPU multiplied by 5. - # You can change it as per the computing power. - # Different python versions treat this differently. This is written as per python 3.6. - n_cores = multiprocessing.cpu_count() - - global max_workers - max_workers = n_cores * 5 - # If the main process is computationally intensive: Set to the number of logical CPU cores minus one. 
- - print(f"Maximum " + str(max_workers) + " threads supported by your machine.") - - -def extract_data_from_html(filename): - soup = BeautifulSoup(open("temp_rs_data/" + filename), "html.parser") - return soup - - -def get_data_from_api(ecli_id): - url = RECHTSPRAAK_METADATA_API_BASE_URL + ecli_id - response_code = check_api(url) - global ecli_df, uitspraak_df, instantie_df, datum_uitspraak_df, datum_publicatie_df, zaaknummer_df, \ - rechtsgebieden_df, bijzondere_kenmerken_df, inhoudsindicatie_df, vindplaatsen_df - try: - if response_code == 200: - try: - # Create HTML file - # html_file = ecli_id + ".html" - html_file = ecli_id.replace(":", "-") + ".html" - urllib.request.urlretrieve(url, "temp_rs_data/" + html_file) - - # Extract data from HTML - html_object = extract_data_from_html(html_file) - - soup = BeautifulSoup(str(html_object), features='lxml') - - # Get the data - uitspraak_info = soup.find_all("div", {"class": "uitspraak-info"}) - section = soup.find_all("div", {"class": "section"}) - - # We're using temporary variable "temp" to get the other metadata information such as instantie, - # datum uitspraak, datum publicatie, zaaknummer, rechtsgebieden, bijzondere kenmerken, - # inhoudsindicatie, and vindplaatsen - temp = soup.find_all("dl", {"class": "dl-horizontal"}) - instantie = BeautifulSoup(str(temp[0]('dd')[0]), features='lxml').get_text().strip() - datum_uitspraak = BeautifulSoup(str(temp[0]('dd')[1]), features='lxml').get_text().strip() - datum_publicatie = BeautifulSoup(str(temp[0]('dd')[2]), features='lxml').get_text().strip() - zaaknummer = BeautifulSoup(str(temp[0]('dd')[3]), features='lxml').get_text().strip() - rechtsgebieden = BeautifulSoup(str(temp[0]('dd')[4]), features='lxml').get_text().strip() - bijzondere_kenmerken = BeautifulSoup(str(temp[0]('dd')[5]), features='lxml').get_text().strip() - inhoudsindicatie = BeautifulSoup(str(temp[0]('dd')[6]), features='lxml').get_text().strip() - vindplaatsen = BeautifulSoup(str(temp[0]('dd')[7]), features='lxml').get_text().strip() - - uitspraak = BeautifulSoup(str(uitspraak_info), features='lxml').get_text() - uitspraak = uitspraak + BeautifulSoup(str(section), features='lxml').get_text() - - ecli_df.append(ecli_id) - uitspraak_df.append(uitspraak) - instantie_df.append(instantie) - datum_uitspraak_df.append(datum_uitspraak) - datum_publicatie_df.append(datum_publicatie) - zaaknummer_df.append(zaaknummer) - rechtsgebieden_df.append(rechtsgebieden) - bijzondere_kenmerken_df.append(bijzondere_kenmerken) - inhoudsindicatie_df.append(inhoudsindicatie) - vindplaatsen_df.append(vindplaatsen) - - del uitspraak, instantie, datum_uitspraak, datum_publicatie, zaaknummer, rechtsgebieden, \ - bijzondere_kenmerken, inhoudsindicatie, vindplaatsen - - # BS4 creates an HTML file to get the data. 
Remove the file after use - if os.path.exists("temp_rs_data/" + html_file): - os.remove("temp_rs_data/" + html_file) - urllib.request.urlcleanup() - - except urllib.error.URLError as e: - print(e) - except urllib.error.HTTPError as e: - print(e) - except Exception as e: - print(e) - else: - ecli_df.append(ecli_id) - uitspraak_df.append("API returned with error code: " + str(response_code)) - except requests.exceptions.RequestException as e: - raise SystemExit(e) - - -def get_rechtspraak_metadata(save_file='n', dataframe=None, filename=None): - if dataframe is not None and filename is not None: - print(f"Please provide either a dataframe or a filename, but not both") - return False - - if dataframe is None and filename is None and save_file == 'n': - print(f"Please provide at least a dataframe of filename when the save_file is \"n\"") - return False - - print("Rechtspraak metadata API") - - start_time = time.time() # Get start time - - no_of_rows = '' - rs_data = '' - csv_files = 0 - - # Check if dataframe is provided and is correct - if dataframe is not None: - if 'id' in dataframe and 'link' in dataframe: - rs_data = dataframe - no_of_rows = rs_data.shape[0] - else: - print("Dataframe is corrupted or does not contain necessary information to get the metadata.") - return False - - # Check if filename is provided and is correct - if filename is not None: - print("Reading " + filename + " from data folder") - file_check = pathlib.Path("data/" + filename) - if file_check.is_file(): - print("File found. Checking if metadata already exists") - # Check if metadata already exists - file_check = Path("data/" + filename.split('/')[-1][:len(filename.split('/')[-1]) - 4] - + "_metadata.csv") - if file_check.is_file(): - print("Metadata for " + filename.split('/')[-1][:len(filename.split('/')[-1]) - 4] + - ".csv already exists.") - return False - else: - rs_data = pd.read_csv('data/' + filename) - if 'id' in rs_data and 'link' in rs_data: - no_of_rows = rs_data.shape[0] - else: - print("File is corrupted or does not contain necessary information to get the metadata.") - return False - else: - print("File not found. Please check the file name.") - return False - - get_cores() # Get number of cores supported by the CPU - - if dataframe is None and filename is None and save_file == 'y': - print("No dataframe or file name is provided. Getting the metadata of all the files present in the " - "data folder") - - print("Reading all CSV files in the data folder...") - csv_files = read_csv('data', "metadata") - - global ecli_df, uitspraak_df, instantie_df, datum_uitspraak_df, datum_publicatie_df, zaaknummer_df, \ - rechtsgebieden_df, bijzondere_kenmerken_df, inhoudsindicatie_df, vindplaatsen_df - - if len(csv_files) > 0 and save_file == 'y': - for f in csv_files: - # Create empty dataframe - rsm_df = pd.DataFrame(columns=['ecli_id', 'uitspraak', 'instantie', 'datum_uitspraak', - 'datum_publicatie', 'zaaknummer', 'rechtsgebieden', - 'bijzondere_kenmerken', 'inhoudsindicatie', 'vindplaatsen']) - - # Check if file already exists - file_check = Path("data/" + f.split('\\')[-1][:len(f.split('\\')[-1]) - 4] + "_metadata.csv") - if file_check.is_file(): - print("Metadata for " + f.split('\\')[-1][:len(f.split('\\')[-1]) - 4] + ".csv already exists.") - continue - - df = pd.read_csv(f) - no_of_rows = df.shape[0] - print("Getting metadata of " + str(no_of_rows) + " ECLIs from " + - f.split('/')[-1][:len(f.split('/')[-1]) - 4] + ".csv") - print("Working. 
Please wait...") - - # Get all ECLIs in a list - ecli_list = list(df.loc[:, 'id']) - - # Create a temporary directory to save files - Path('temp_rs_data').mkdir(parents=True, exist_ok=True) - with ThreadPoolExecutor(max_workers=max_workers) as executor: - for ecli in ecli_list: - threads.append(executor.submit(get_data_from_api, ecli)) - - # Delete temporary directory - shutil.rmtree('temp_rs_data') - # executor.shutdown() # Shutdown the executor - - # Save CSV file - print("Creating CSV file...") - - rsm_df['ecli_id'] = ecli_df - rsm_df['uitspraak'] = uitspraak_df - rsm_df['instantie'] = instantie_df - rsm_df['datum_uitspraak'] = datum_uitspraak_df - rsm_df['datum_publicatie'] = datum_publicatie_df - rsm_df['zaaknummer'] = zaaknummer_df - rsm_df['rechtsgebieden'] = rechtsgebieden_df - rsm_df['bijzondere_kenmerken'] = bijzondere_kenmerken_df - rsm_df['inhoudsindicatie'] = inhoudsindicatie_df - rsm_df['vindplaatsen'] = vindplaatsen_df - - # Create directory if not exists - Path('data').mkdir(parents=True, exist_ok=True) - - rsm_df.to_csv("data/" + f.split('\\')[-1][:len(f.split('\\')[-1]) - 4] + "_metadata.csv", - index=False, encoding='utf8') - print("CSV file " + f.split('\\')[-1][:len(f.split('\\')[-1]) - 4] + "_metadata.csv" + - " successfully created.\n") - - # Clear the lists for the next file - ecli_df = [] - uitspraak_df = [] - instantie_df = [] - datum_uitspraak_df = [] - datum_publicatie_df = [] - zaaknummer_df = [] - rechtsgebieden_df = [] - bijzondere_kenmerken_df = [] - inhoudsindicatie_df = [] - vindplaatsen_df = [] - ecli_list = [] - del rsm_df - return True - - if rs_data is not None: - rsm_df = pd.DataFrame(columns=['ecli_id', 'uitspraak', 'instantie', 'datum_uitspraak', 'datum_publicatie', - 'zaaknummer', 'rechtsgebieden', 'bijzondere_kenmerken', 'inhoudsindicatie', - 'vindplaatsen']) - - print("Getting metadata of " + str(no_of_rows) + " ECLIs") - print("Working. 
Please wait...") - # Get all ECLIs in a list - ecli_list = list(rs_data.loc[:, 'id']) - - # Create a temporary directory to save files - Path('temp_rs_data').mkdir(parents=True, exist_ok=True) - - with ThreadPoolExecutor(max_workers=max_workers) as executor: - for ecli in ecli_list: - threads.append(executor.submit(get_data_from_api, ecli)) - - # Delete temporary directory - shutil.rmtree('temp_rs_data') - - # global ecli_df, uitspraak_df, instantie_df, datum_uitspraak_df, datum_publicatie_df, zaaknummer_df, \ - # rechtsgebieden_df, bijzondere_kenmerken_df, inhoudsindicatie_df, vindplaatsen_df - - rsm_df['ecli_id'] = ecli_df - rsm_df['uitspraak'] = uitspraak_df - rsm_df['instantie'] = instantie_df - rsm_df['datum_uitspraak'] = datum_uitspraak_df - rsm_df['datum_publicatie'] = datum_publicatie_df - rsm_df['zaaknummer'] = zaaknummer_df - rsm_df['rechtsgebieden'] = rechtsgebieden_df - rsm_df['bijzondere_kenmerken'] = bijzondere_kenmerken_df - rsm_df['inhoudsindicatie'] = inhoudsindicatie_df - rsm_df['vindplaatsen'] = vindplaatsen_df - - if save_file == 'y': - if filename is None or filename == '': - filename = "custom_rechtspraak_" + datetime.now().strftime("%H-%M-%S") + ".csv" - # Create directory if not exists - Path('data').mkdir(parents=True, exist_ok=True) - - rsm_df.to_csv("data/" + filename.split('/')[-1][:len(filename.split('/')[-1]) - 4] + "_metadata.csv", - index=False, encoding='utf8') - print("CSV file " + filename.split('/')[-1][:len(filename.split('/')[-1]) - 4] + "_metadata.csv" + - " successfully created.\n") - - # Clear the lists for the next file - ecli_df = [] - uitspraak_df = [] - instantie_df = [] - datum_uitspraak_df = [] - datum_publicatie_df = [] - zaaknummer_df = [] - rechtsgebieden_df = [] - bijzondere_kenmerken_df = [] - inhoudsindicatie_df = [] - vindplaatsen_df = [] - ecli_list = [] - - get_exe_time(start_time) - - if save_file == 'n': - return rsm_df - - return True - diff --git a/rechtspraak/setup.py b/rechtspraak/setup.py deleted file mode 100644 index add6313..0000000 --- a/rechtspraak/setup.py +++ /dev/null @@ -1,25 +0,0 @@ -# This file is required to create a python library - -from setuptools import find_packages, setup -from pathlib import Path - -p = Path("README.md") -long_descr = p.read_text() - -setup( - name='rechtspraak_extractor', - packages=find_packages(include=['rechtspraak_extractor']), - version='1.1.17', - description='Library for extracting rechtspraak data', - author='LawTech Lab', - license='MIT', - install_requires=['bs4', 'lxml==4.6.3', 'requests==2.26.0', 'xmltodict==0.13.0', 'python_dotenv==0.15.0', 'pandas','tqdm'], - author_email='pranav.bapat@student.maastrichtuniversity.nl', - keywords=['rechtspraak', 'extractor', 'rechtspraak extractor'], - long_description=long_descr, - long_description_content_type='text/markdown', - project_urls={ - "Bug Tracker": "https://github.com/maastrichtlawtech/extraction_libraries", - "Build Source": "https://github.com/maastrichtlawtech/extraction_libraries", - }, -) \ No newline at end of file diff --git a/tests.py b/tests.py index adcb207..4732cdb 100644 --- a/tests.py +++ b/tests.py @@ -1,26 +1,4 @@ from cellar_extractor import * -from echr_extractor import * -from rechtspraak_extractor import * - -def echr_y(): - get_echr(save_file='y',count=100,start_date='2022-01-01') - -def echr_n(): - get_echr(save_file='n',count=100,start_date='2022-01-01') - -def echr_extra_y(): - get_echr_extra(save_file='n',count=100,start_date='2022-01-01') - -def echr_extra_n(): - 
get_echr_extra(save_file='n',count=100,start_date='2022-01-01') - -def rechtspraak_n(): - df = get_rechtspraak(max_ecli=100,sd='2022-01-01',save_file='n') - get_rechtspraak_metadata(save_file='n',dataframe=df) - -def rechtspraak_y(): - df = get_rechtspraak(max_ecli=100,sd='2022-01-01',save_file='y') - get_rechtspraak_metadata(save_file='y',dataframe=df) def cellar_csv_n(): get_cellar(save_file='n', file_format='csv', sd='2022-01-01', max_ecli=100) @@ -92,46 +70,3 @@ def test_cellar_json_n(): assert True except Exception: assert False, "Downloading cellar as json failed." - - -def test_echr_extra_y(): - try: - echr_extra_y() - assert True - except Exception: - assert False, "Saving extra echr failed" - -def test_echr_extra_n(): - try: - echr_extra_n() - assert True - except Exception: - assert False, "Downloading extra echr failed" - -def test_echr_y(): - try: - echr_y() - assert True - except Exception: - assert False, "Saving echr failed" - -def test_echr_n(): - try: - echr_n() - assert True - except Exception: - assert False, "Downloading echr failed" - -def test_rechtspraak_y(): - try: - rechtspraak_y() - assert True - except Exception: - assert False, "Saving extra rechtspraak failed" - -def test_rechtspraak_n(): - try: - rechtspraak_n() - assert True - except Exception: - assert False, "Downloading extra rechtspraak failed" From 6231dd6bfb060b4c832ba9e41959297cd3fcd2a0 Mon Sep 17 00:00:00 2001 From: Piotr Lewandowski Date: Sat, 15 Jul 2023 15:14:18 +0200 Subject: [PATCH 02/11] fixing cellar pandas error --- cellar/cellar_extractor/fulltext_saving.py | 2 +- cellar/setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cellar/cellar_extractor/fulltext_saving.py b/cellar/cellar_extractor/fulltext_saving.py index 44af01c..c4895db 100644 --- a/cellar/cellar_extractor/fulltext_saving.py +++ b/cellar/cellar_extractor/fulltext_saving.py @@ -173,6 +173,6 @@ def add_sections(data, threads, json_filepath=None): def add_column_frow_list(data, name, list): column = pd.Series([], dtype='string') for l in list: - column = column.append(l) + column = pd.concart(column,l) column.sort_index(inplace=True) data.insert(1, name, column) diff --git a/cellar/setup.py b/cellar/setup.py index 4a4a614..cc3037e 100644 --- a/cellar/setup.py +++ b/cellar/setup.py @@ -10,7 +10,7 @@ setup( name='cellar_extractor', packages=find_packages(include=['cellar_extractor']), - version='1.0.50', + version='1.0.51', description='Library for extracting cellar data', author='LawTech Lab', license='MIT', From 808fd595177c40bbc0ceb1fb07e156fc7d8f8036 Mon Sep 17 00:00:00 2001 From: Piotr Lewandowski Date: Sat, 15 Jul 2023 15:20:17 +0200 Subject: [PATCH 03/11] fixing cellar pandas error #2 --- cellar/cellar_extractor/fulltext_saving.py | 2 +- cellar/setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cellar/cellar_extractor/fulltext_saving.py b/cellar/cellar_extractor/fulltext_saving.py index c4895db..49fe4d1 100644 --- a/cellar/cellar_extractor/fulltext_saving.py +++ b/cellar/cellar_extractor/fulltext_saving.py @@ -173,6 +173,6 @@ def add_sections(data, threads, json_filepath=None): def add_column_frow_list(data, name, list): column = pd.Series([], dtype='string') for l in list: - column = pd.concart(column,l) + column = pd.concat(column,l) column.sort_index(inplace=True) data.insert(1, name, column) diff --git a/cellar/setup.py b/cellar/setup.py index cc3037e..7394c42 100644 --- a/cellar/setup.py +++ b/cellar/setup.py @@ -10,7 +10,7 @@ setup( name='cellar_extractor', 
packages=find_packages(include=['cellar_extractor']), - version='1.0.51', + version='1.0.52', description='Library for extracting cellar data', author='LawTech Lab', license='MIT', From b4071f0603fbeae6e58a170ac1b0c7dec354d9b1 Mon Sep 17 00:00:00 2001 From: Piotr Lewandowski Date: Sat, 15 Jul 2023 15:25:46 +0200 Subject: [PATCH 04/11] fixing cellar pandas error #3 --- cellar/cellar_extractor/fulltext_saving.py | 2 +- cellar/setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cellar/cellar_extractor/fulltext_saving.py b/cellar/cellar_extractor/fulltext_saving.py index 49fe4d1..8bb03c3 100644 --- a/cellar/cellar_extractor/fulltext_saving.py +++ b/cellar/cellar_extractor/fulltext_saving.py @@ -173,6 +173,6 @@ def add_sections(data, threads, json_filepath=None): def add_column_frow_list(data, name, list): column = pd.Series([], dtype='string') for l in list: - column = pd.concat(column,l) + column = pd.concat([column,l]) column.sort_index(inplace=True) data.insert(1, name, column) diff --git a/cellar/setup.py b/cellar/setup.py index 7394c42..5473f4f 100644 --- a/cellar/setup.py +++ b/cellar/setup.py @@ -10,7 +10,7 @@ setup( name='cellar_extractor', packages=find_packages(include=['cellar_extractor']), - version='1.0.52', + version='1.0.53', description='Library for extracting cellar data', author='LawTech Lab', license='MIT', From 7d433121f9009ccd7d49e5b5c9cf2cfa67bbc7e8 Mon Sep 17 00:00:00 2001 From: Piotr Lewandowski Date: Thu, 3 Aug 2023 11:58:41 +0200 Subject: [PATCH 05/11] filtering by subject matter method added --- cellar/README.md | 9 ++++++ cellar/cellar_extractor/Testing_file.py | 3 +- cellar/cellar_extractor/__init__.py | 3 +- cellar/cellar_extractor/cellar.py | 36 ++++++++++++++++-------- cellar/cellar_extractor/csv_extractor.py | 1 + cellar/setup.py | 2 +- 6 files changed, 40 insertions(+), 14 deletions(-) diff --git a/cellar/README.md b/cellar/README.md index c870daa..6447a9d 100644 --- a/cellar/README.md +++ b/cellar/README.md @@ -57,6 +57,8 @@ Python 3.9
  • get_nodes_and_edges_lists
  • 
 Gets two list objects, one for the nodes and one for the edges of the citation network within the passed dataframe.
 Allows the creation of a network graph of the citations. Can only be returned in-memory.
 +
  • filter_subject_matter
  • +
 Returns a dataframe containing only those cases whose subject matter column contains a given phrase.
    @@ -103,6 +105,13 @@ Python 3.9 DataFrame of cellar metadata acquired from the get_cellar_extra method with eurlex webservice credentials passed. This method will only work on dataframes with citations data. +
  • filter_subject_matter
  • +
      +
    • df: DataFrame object, required, default None
    • + DataFrame of cellar metadata acquired from any of the cellar extraction methods listed above. +
    • phrase: string, required, default None
    • +
 The phrase that must be present in the subject matter of a case; matching is case-insensitive. +
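A minimal in-memory usage sketch of the new filter_subject_matter helper, mirroring the call shown in Testing_file.py above (a sketch, not part of the patch itself; it assumes the cellar-extractor package is installed and the Cellar endpoint is reachable):
```
from cellar_extractor import get_cellar, filter_subject_matter

# Download a small batch of cellar metadata in-memory (no file saved).
df = get_cellar(sd='2023-01-01', max_ecli=100, save_file='n')

# Keep only the cases whose subject matter mentions "prices" (case-insensitive).
filtered = filter_subject_matter(df, "prices")
print(filtered.shape)
```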
    diff --git a/cellar/cellar_extractor/Testing_file.py b/cellar/cellar_extractor/Testing_file.py index d2d81b3..8204c52 100644 --- a/cellar/cellar_extractor/Testing_file.py +++ b/cellar/cellar_extractor/Testing_file.py @@ -25,5 +25,6 @@ cits = get_citations_with_extra_info(text) print(cits) data,d2 = get_cellar_extra(sd='2023-01-01',max_ecli=100,save_file='n') - nodes_edges = get_nodes_and_edges_lists(data) + d3 = filter_subject_matter(data, "prices") + b=2 pass \ No newline at end of file diff --git a/cellar/cellar_extractor/__init__.py b/cellar/cellar_extractor/__init__.py index 3f96902..7821b0c 100644 --- a/cellar/cellar_extractor/__init__.py +++ b/cellar/cellar_extractor/__init__.py @@ -1,3 +1,4 @@ from cellar_extractor.cellar import get_cellar from cellar_extractor.cellar import get_cellar_extra -from cellar_extractor.cellar import get_nodes_and_edges_lists \ No newline at end of file +from cellar_extractor.cellar import get_nodes_and_edges_lists +from cellar_extractor.cellar import filter_subject_matter diff --git a/cellar/cellar_extractor/cellar.py b/cellar/cellar_extractor/cellar.py index 24c4a67..4dac929 100644 --- a/cellar/cellar_extractor/cellar.py +++ b/cellar/cellar_extractor/cellar.py @@ -1,14 +1,16 @@ import json import os -from os.path import join +import time from datetime import datetime from pathlib import Path + from tqdm import tqdm + +from cellar_extractor.cellar_extra_extract import extra_cellar from cellar_extractor.cellar_queries import get_all_eclis, get_raw_cellar_metadata from cellar_extractor.json_to_csv import json_to_csv_main, json_to_csv_returning -from cellar_extractor.cellar_extra_extract import extra_cellar from cellar_extractor.nodes_and_edges import get_nodes_and_edges -import time + def get_cellar(ed=None, save_file='y', max_ecli=100, sd="2022-05-01", file_format='csv'): if not ed: @@ -28,7 +30,7 @@ def get_cellar(ed=None, save_file='y', max_ecli=100, sd="2022-05-01", file_forma return False all_eclis = {} concurrent_docs = 100 - for i in tqdm(range(0, len(eclis), concurrent_docs),colour="GREEN"): + for i in tqdm(range(0, len(eclis), concurrent_docs), colour="GREEN"): new_eclis = get_raw_cellar_metadata(eclis[i:(i + concurrent_docs)]) all_eclis = {**all_eclis, **new_eclis} if save_file == 'y': @@ -62,23 +64,35 @@ def get_cellar_extra(ed=None, save_file='y', max_ecli=100, sd="2022-05-01", thre file_path = os.path.join('data', file_name + '.csv') if save_file == 'y': Path('data').mkdir(parents=True, exist_ok=True) - extra_cellar(data = data ,filepath=file_path, threads=threads, username=username, password=password) + extra_cellar(data=data, filepath=file_path, threads=threads, username=username, password=password) print("\n--- DONE ---") else: - data,json = extra_cellar(data= data, threads = threads, username= username,password=password) + data, json = extra_cellar(data=data, threads=threads, username=username, password=password) print("\n--- DONE ---") - return data,json + return data, json -def get_nodes_and_edges_lists(df = None): + +def get_nodes_and_edges_lists(df=None): if df is None: print("No dataframe passed!") return else: try: - nodes,edges = get_nodes_and_edges(df) + nodes, edges = get_nodes_and_edges(df) except: print('Something went wrong. Nodes and edges creation unsuccessful.') - return False,False - return nodes,edges + return False, False + return nodes, edges + + +def filter_subject_matter(df=None, phrase=None): + if df is None or phrase is None: + print("Incorrect input values! \n Returning... 
\n") + else: + try: + mask = df["LEGAL RESOURCE IS ABOUT SUBJECT MATTER"].str.lower().str.contains(phrase) + return df[mask] + except: + print("Something went wrong!\n Returning... \n") diff --git a/cellar/cellar_extractor/csv_extractor.py b/cellar/cellar_extractor/csv_extractor.py index 23ee71c..b623aac 100644 --- a/cellar/cellar_extractor/csv_extractor.py +++ b/cellar/cellar_extractor/csv_extractor.py @@ -24,6 +24,7 @@ def extract_rows(data, number): print("") print("EXTRACTION FROM CSV FILE IN DATA PROCESSED DIR STARTED") print("") + DIR_DATA_RAW='' csv_files = (glob.glob(DIR_DATA_RAW + "/" + "*.csv")) print(f"FOUND {len(csv_files)} CSV FILES") diff --git a/cellar/setup.py b/cellar/setup.py index 5473f4f..2ad4544 100644 --- a/cellar/setup.py +++ b/cellar/setup.py @@ -10,7 +10,7 @@ setup( name='cellar_extractor', packages=find_packages(include=['cellar_extractor']), - version='1.0.53', + version='1.0.54', description='Library for extracting cellar data', author='LawTech Lab', license='MIT', From ec3829eaac04652873583a30714c9f7729b967d8 Mon Sep 17 00:00:00 2001 From: Piotr Lewandowski Date: Fri, 4 Aug 2023 15:41:19 +0200 Subject: [PATCH 06/11] new nodes edges functionality --- cellar/README.md | 3 +++ cellar/cellar_extractor/cellar.py | 15 ++++++------- cellar/cellar_extractor/nodes_and_edges.py | 26 +++++++++++++--------- cellar/setup.py | 2 +- 4 files changed, 26 insertions(+), 20 deletions(-) diff --git a/cellar/README.md b/cellar/README.md index 6447a9d..acb6b80 100644 --- a/cellar/README.md +++ b/cellar/README.md @@ -104,6 +104,9 @@ Python 3.9
  • df: DataFrame object, required, default None
  • DataFrame of cellar metadata acquired from the get_cellar_extra method with eurlex webservice credentials passed. This method will only work on dataframes with citations data. +
  • only_local: boolean, optional, default False
  • +
 Flag for nodes and edges generation. If set to True, the resulting network only includes nodes and edges between
 + cases that are themselves present in the given dataframe.
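A minimal sketch of the only_local flag (illustrative only; the username and password values are placeholders for eurlex webservice credentials, which are needed to obtain the citations data this method relies on):
```
from cellar_extractor import get_cellar_extra, get_nodes_and_edges_lists

# Citation data requires the extra extraction with eurlex webservice credentials.
df, json_data = get_cellar_extra(sd='2023-01-01', max_ecli=100, save_file='n',
                                 username='your_username', password='your_password')

# Full network: edges may point to cited cases outside the dataframe.
nodes, edges = get_nodes_and_edges_lists(df)

# Local network: only edges between cases that appear in the dataframe itself.
local_nodes, local_edges = get_nodes_and_edges_lists(df, only_local=True)
```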
  • filter_subject_matter
    • diff --git a/cellar/cellar_extractor/cellar.py b/cellar/cellar_extractor/cellar.py index 4dac929..fb197cd 100644 --- a/cellar/cellar_extractor/cellar.py +++ b/cellar/cellar_extractor/cellar.py @@ -74,17 +74,16 @@ def get_cellar_extra(ed=None, save_file='y', max_ecli=100, sd="2022-05-01", thre return data, json -def get_nodes_and_edges_lists(df=None): +def get_nodes_and_edges_lists(df=None, only_local=False): if df is None: print("No dataframe passed!") return - else: - try: - nodes, edges = get_nodes_and_edges(df) - except: - print('Something went wrong. Nodes and edges creation unsuccessful.') - return False, False - return nodes, edges + try: + nodes, edges = get_nodes_and_edges(df,only_local) + except: + print('Something went wrong. Nodes and edges creation unsuccessful.') + return False, False + return nodes, edges def filter_subject_matter(df=None, phrase=None): diff --git a/cellar/cellar_extractor/nodes_and_edges.py b/cellar/cellar_extractor/nodes_and_edges.py index 9578c36..13adf6b 100644 --- a/cellar/cellar_extractor/nodes_and_edges.py +++ b/cellar/cellar_extractor/nodes_and_edges.py @@ -5,7 +5,7 @@ def extract_containing_subject_matter(df,phrase): def get_df_with_celexes(df,celexes): returner = df[df['CELEX IDENTIFIER'].isin(celexes)] return returner -def get_edges_list(df): +def get_edges_list(df,only_local): extraction = df[['CELEX IDENTIFIER','citing']] extraction.reset_index(inplace=True) keys = extraction['CELEX IDENTIFIER'] @@ -15,16 +15,20 @@ def get_edges_list(df): for i in range(len(keys)): k = keys[i] val = vals[i] - if val == val: - nodes.add(str(k)) - val_unpacked = val.split(";") - for val in val_unpacked: - nodes.add(str(val)) - edges.append(str(k)+','+str(val)) - else: - pass + if val != val: + continue + nodes.add(str(k)) + val_unpacked = val.split(";") + for val in val_unpacked: + if only_local and val not in keys: + continue + nodes.add(str(val)) + edges.append(str(k)+','+str(val)) + + nodes = list(nodes) + return edges, list(nodes) -def get_nodes_and_edges(df): - edges, nodes = get_edges_list(df) +def get_nodes_and_edges(df,only_local): + edges, nodes = get_edges_list(df,only_local) #nodes = get_df_with_celexes(df,celexes) return nodes,edges \ No newline at end of file diff --git a/cellar/setup.py b/cellar/setup.py index 2ad4544..0507f5f 100644 --- a/cellar/setup.py +++ b/cellar/setup.py @@ -10,7 +10,7 @@ setup( name='cellar_extractor', packages=find_packages(include=['cellar_extractor']), - version='1.0.54', + version='1.0.55', description='Library for extracting cellar data', author='LawTech Lab', license='MIT', From cbc244615b0f8af6cf3f0c62f0e924f88c17e934 Mon Sep 17 00:00:00 2001 From: Piotr Lewandowski Date: Sat, 5 Aug 2023 18:11:58 +0200 Subject: [PATCH 07/11] small fix --- cellar/cellar_extractor/cellar.py | 6 ++++-- cellar/setup.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/cellar/cellar_extractor/cellar.py b/cellar/cellar_extractor/cellar.py index fb197cd..820acc8 100644 --- a/cellar/cellar_extractor/cellar.py +++ b/cellar/cellar_extractor/cellar.py @@ -91,7 +91,9 @@ def filter_subject_matter(df=None, phrase=None): print("Incorrect input values! \n Returning... \n") else: try: - mask = df["LEGAL RESOURCE IS ABOUT SUBJECT MATTER"].str.lower().str.contains(phrase) + mask = df["LEGAL RESOURCE IS ABOUT SUBJECT MATTER"].str.lower().str.contains(phrase,na=False) return df[mask] - except: + except Exception as e: + print(e) print("Something went wrong!\n Returning... 
\n") + return None diff --git a/cellar/setup.py b/cellar/setup.py index 0507f5f..66cd9bc 100644 --- a/cellar/setup.py +++ b/cellar/setup.py @@ -10,7 +10,7 @@ setup( name='cellar_extractor', packages=find_packages(include=['cellar_extractor']), - version='1.0.55', + version='1.0.57', description='Library for extracting cellar data', author='LawTech Lab', license='MIT', From b007556ba25c1f023bfd0a331085171c9b967ea9 Mon Sep 17 00:00:00 2001 From: Piotr Lewandowski Date: Sat, 12 Aug 2023 14:57:42 +0200 Subject: [PATCH 08/11] small fix --- cellar/cellar_extractor/Testing_file.py | 2 +- cellar/cellar_extractor/cellar.py | 2 +- cellar/setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cellar/cellar_extractor/Testing_file.py b/cellar/cellar_extractor/Testing_file.py index 8204c52..412e4e8 100644 --- a/cellar/cellar_extractor/Testing_file.py +++ b/cellar/cellar_extractor/Testing_file.py @@ -24,7 +24,7 @@ text = get_full_text_from_html(site) cits = get_citations_with_extra_info(text) print(cits) - data,d2 = get_cellar_extra(sd='2023-01-01',max_ecli=100,save_file='n') + data = get_cellar(sd='2023-01-01',max_ecli=100,save_file='n') d3 = filter_subject_matter(data, "prices") b=2 pass \ No newline at end of file diff --git a/cellar/cellar_extractor/cellar.py b/cellar/cellar_extractor/cellar.py index 820acc8..3d12be0 100644 --- a/cellar/cellar_extractor/cellar.py +++ b/cellar/cellar_extractor/cellar.py @@ -91,7 +91,7 @@ def filter_subject_matter(df=None, phrase=None): print("Incorrect input values! \n Returning... \n") else: try: - mask = df["LEGAL RESOURCE IS ABOUT SUBJECT MATTER"].str.lower().str.contains(phrase,na=False) + mask = df["LEGAL RESOURCE IS ABOUT SUBJECT MATTER"].str.lower().str.contains(phrase.lower(), na=False) return df[mask] except Exception as e: print(e) diff --git a/cellar/setup.py b/cellar/setup.py index 66cd9bc..31ea5fd 100644 --- a/cellar/setup.py +++ b/cellar/setup.py @@ -10,7 +10,7 @@ setup( name='cellar_extractor', packages=find_packages(include=['cellar_extractor']), - version='1.0.57', + version='1.0.58', description='Library for extracting cellar data', author='LawTech Lab', license='MIT', From 4e9e4b366b63086839370328655ba96fd1f108eb Mon Sep 17 00:00:00 2001 From: Piotr Lewandowski Date: Thu, 17 Aug 2023 12:34:19 +0200 Subject: [PATCH 09/11] update --- cellar/cellar_extractor/nodes_and_edges.py | 4 ++-- cellar/setup.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cellar/cellar_extractor/nodes_and_edges.py b/cellar/cellar_extractor/nodes_and_edges.py index 13adf6b..087c798 100644 --- a/cellar/cellar_extractor/nodes_and_edges.py +++ b/cellar/cellar_extractor/nodes_and_edges.py @@ -8,8 +8,8 @@ def get_df_with_celexes(df,celexes): def get_edges_list(df,only_local): extraction = df[['CELEX IDENTIFIER','citing']] extraction.reset_index(inplace=True) - keys = extraction['CELEX IDENTIFIER'] - vals = extraction['citing'] + keys = extraction['CELEX IDENTIFIER'].tolist() + vals = extraction['citing'].tolist() nodes = set() edges = list() for i in range(len(keys)): diff --git a/cellar/setup.py b/cellar/setup.py index 31ea5fd..87cbb16 100644 --- a/cellar/setup.py +++ b/cellar/setup.py @@ -10,7 +10,7 @@ setup( name='cellar_extractor', packages=find_packages(include=['cellar_extractor']), - version='1.0.58', + version='1.0.59', description='Library for extracting cellar data', author='LawTech Lab', license='MIT', From 8116bc6f1fcdc634a9cee2a1bf4ed075b79aab08 Mon Sep 17 00:00:00 2001 From: Piotr Lewandowski Date: Wed, 27 Sep 2023 14:46:01 
+0200 Subject: [PATCH 10/11] logging for cellar --- cellar/cellar_extractor/cellar.py | 32 +++++++++++----------- cellar/cellar_extractor/citations_adder.py | 15 +++++----- cellar/cellar_extractor/csv_extractor.py | 3 +- cellar/cellar_extractor/json_to_csv.py | 13 +++++---- cellar/setup.py | 2 +- 5 files changed, 34 insertions(+), 31 deletions(-) diff --git a/cellar/cellar_extractor/cellar.py b/cellar/cellar_extractor/cellar.py index 3d12be0..b108844 100644 --- a/cellar/cellar_extractor/cellar.py +++ b/cellar/cellar_extractor/cellar.py @@ -3,7 +3,7 @@ import time from datetime import datetime from pathlib import Path - +import logging from tqdm import tqdm from cellar_extractor.cellar_extra_extract import extra_cellar @@ -17,16 +17,16 @@ def get_cellar(ed=None, save_file='y', max_ecli=100, sd="2022-05-01", file_forma ed = datetime.now().isoformat(timespec='seconds') file_name = 'cellar_' + sd + '_' + ed file_name = file_name.replace(":", "_") - print('\n--- PREPARATION ---\n') - print(f'Starting from specified start date: {sd}') - print(f'Up until the specified end date {ed}') + logging.info('\n--- PREPARATION ---\n') + logging.info(f'Starting from specified start date: {sd}') + logging.info(f'Up until the specified end date {ed}') eclis = get_all_eclis(starting_date=sd, ending_date=ed) - print(f"Found {len(eclis)} ECLIs") + logging.info(f"Found {len(eclis)} ECLIs") time.sleep(1) if len(eclis) > max_ecli: eclis = eclis[:max_ecli] if len(eclis) == 0: - print(f"No data to download found between {sd} and {ed}") + logging.info(f"No data to download found between {sd} and {ed}") return False all_eclis = {} concurrent_docs = 100 @@ -48,7 +48,7 @@ def get_cellar(ed=None, save_file='y', max_ecli=100, sd="2022-05-01", file_forma return df else: return all_eclis - print("\n--- DONE ---") + logging.info("\n--- DONE ---") def get_cellar_extra(ed=None, save_file='y', max_ecli=100, sd="2022-05-01", threads=10, username="", password=""): @@ -56,44 +56,44 @@ def get_cellar_extra(ed=None, save_file='y', max_ecli=100, sd="2022-05-01", thre ed = datetime.now().isoformat(timespec='seconds') data = get_cellar(ed=ed, save_file='n', max_ecli=max_ecli, sd=sd, file_format='csv') if data is False: - print("Cellar extraction unsuccessful") + logging.warning("Cellar extraction unsuccessful") return False, False - print("\n--- START OF EXTRA EXTRACTION ---") + logging.info("\n--- START OF EXTRA EXTRACTION ---") file_name = 'cellar_extra_' + sd + '_' + ed file_name = file_name.replace(":", "_") file_path = os.path.join('data', file_name + '.csv') if save_file == 'y': Path('data').mkdir(parents=True, exist_ok=True) extra_cellar(data=data, filepath=file_path, threads=threads, username=username, password=password) - print("\n--- DONE ---") + logging.info("\n--- DONE ---") else: data, json = extra_cellar(data=data, threads=threads, username=username, password=password) - print("\n--- DONE ---") + logging.info("\n--- DONE ---") return data, json def get_nodes_and_edges_lists(df=None, only_local=False): if df is None: - print("No dataframe passed!") + logging.warning("No dataframe passed!") return try: nodes, edges = get_nodes_and_edges(df,only_local) except: - print('Something went wrong. Nodes and edges creation unsuccessful.') + logging.warning('Something went wrong. Nodes and edges creation unsuccessful.') return False, False return nodes, edges def filter_subject_matter(df=None, phrase=None): if df is None or phrase is None: - print("Incorrect input values! \n Returning... \n") + logging.info("Incorrect input values! 
\n Returning... \n") else: try: mask = df["LEGAL RESOURCE IS ABOUT SUBJECT MATTER"].str.lower().str.contains(phrase.lower(), na=False) return df[mask] except Exception as e: - print(e) - print("Something went wrong!\n Returning... \n") + logging.warning(e) + logging.warning("Something went wrong!\n Returning... \n") return None diff --git a/cellar/cellar_extractor/citations_adder.py b/cellar/cellar_extractor/citations_adder.py index 99de07d..ba47721 100644 --- a/cellar/cellar_extractor/citations_adder.py +++ b/cellar/cellar_extractor/citations_adder.py @@ -1,6 +1,7 @@ import sys import threading import time +import logging from io import StringIO from os.path import dirname, abspath import pandas as pd @@ -111,7 +112,7 @@ def process_queries(link, celex): response = run_eurlex_webservice_query(query, username, password) if response.status_code == 500 and "WS_WS_CALLS_IDLE_INTERVAL" not in response.text: perc=i*100/len(celexes) - print(f"Limit of web service usage reached! Citations collection will stop here at {perc} % of citations downloaded." + logging.info(f"Limit of web service usage reached! Citations collection will stop here at {perc} % of citations downloaded." + f"\nThere were {success} successful queries and {retry} retries") return elif "0" in response.text: @@ -124,7 +125,7 @@ def process_queries(link, celex): failure = True except: retry+=1 - #print(response.content) + #logging.info(response.content) time.sleep(0.5) time.sleep(2) if len(normal_celex)>0: @@ -225,24 +226,24 @@ def add_citations_separate_webservice(data, username, password): response = run_eurlex_webservice_query(query, username, password) if response.status_code == 500 : if "WS_MAXIMUM_NB_OF_WS_CALLS" in response.text: - print("Maximum number of calls to the eurlex webservices reached! The code will skip the citations download.") + logging.warning("Maximum number of calls to the eurlex webservices reached! The code will skip the citations download.") return else: - print("Incorrect username and password for eurlex webservices! (The account login credentials and webservice) " + logging.warning("Incorrect username and password for eurlex webservices! (The account login credentials and webservice) " + "login credentials are different)") sys.exit(2) elif response.status_code == 403: - print("Webservice connection was blocked, eurlex might be going through maintenance right now.") + logging.info("Webservice connection was blocked, eurlex might be going through maintenance right now.") sys.exit(2) else: - print("Webservice connection was successful!") + logging.info("Webservice connection was successful!") time.sleep(1) dictionary_list = list() execute_citations_webservice(dictionary_list,celex,username,password) citing_dict = dict() for d in dictionary_list: citing_dict.update(d) - print("Webservice extraction finished, the rest of extraction will now happen.") + logging.info("Webservice extraction finished, the rest of extraction will now happen.") time.sleep(1) # It seemed to print out the length of dictionary wrong, even when it was equal to 1000. 
cited_dict = reverse_citing_dict(citing_dict) diff --git a/cellar/cellar_extractor/csv_extractor.py b/cellar/cellar_extractor/csv_extractor.py index b623aac..d6a4527 100644 --- a/cellar/cellar_extractor/csv_extractor.py +++ b/cellar/cellar_extractor/csv_extractor.py @@ -1,5 +1,6 @@ import glob import argparse +import logging from cellar_extractor.json_to_csv import read_csv """ @@ -11,7 +12,7 @@ def extract_rows(data, number): try: output = data[1:number] except Exception: - print(f"The file does not have {number} entries, returning entire file.") + logging.info(f"The file does not have {number} entries, returning entire file.") output = data return output diff --git a/cellar/cellar_extractor/json_to_csv.py b/cellar/cellar_extractor/json_to_csv.py index 1a1e62e..2781ae2 100644 --- a/cellar/cellar_extractor/json_to_csv.py +++ b/cellar/cellar_extractor/json_to_csv.py @@ -1,6 +1,7 @@ import csv import re import warnings +import logging from bs4 import BeautifulSoup import sys import pandas as pd @@ -90,8 +91,8 @@ def read_csv(file_path): data = pd.read_csv(file_path, sep=",", encoding='utf-8') return data except Exception: - print("Something went wrong when trying to open the csv file!") - print(f" The path to the file was {file_path}") + logging.info("Something went wrong when trying to open the csv file!") + logging.info(f" The path to the file was {file_path}") sys.exit(2) @@ -112,10 +113,10 @@ def json_to_csv_returning(json_data): if final_data: return create_csv_returning(final_data) else: - print("Error creating dataframe. Data is empty.") + logging.info("Error creating dataframe. Data is empty.") return False else: - print("Error reading json file. Please make sure json file exists and contains data.") + logging.info("Error reading json file. Please make sure json file exists and contains data.") return False @@ -125,9 +126,9 @@ def json_to_csv_main(json_data, filepath): if final_data: create_csv(filepath=filepath, encoding="UTF8", data=final_data) else: - print("Error creating CSV file. Data is empty.") + logging.info("Error creating CSV file. Data is empty.") return False else: - print("Error reading json file. Please make sure json file exists and contains data.") + logging.info("Error reading json file. 
Please make sure json file exists and contains data.") return False return True diff --git a/cellar/setup.py b/cellar/setup.py index 87cbb16..11bb73c 100644 --- a/cellar/setup.py +++ b/cellar/setup.py @@ -10,7 +10,7 @@ setup( name='cellar_extractor', packages=find_packages(include=['cellar_extractor']), - version='1.0.59', + version='1.0.60', description='Library for extracting cellar data', author='LawTech Lab', license='MIT', From 560fc4d29c2cc211344952aadc2db5a54b6201af Mon Sep 17 00:00:00 2001 From: Piotr Lewandowski Date: Tue, 3 Oct 2023 10:46:37 +0200 Subject: [PATCH 11/11] logging fix --- cellar/cellar_extractor/__init__.py | 2 ++ cellar/setup.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/cellar/cellar_extractor/__init__.py b/cellar/cellar_extractor/__init__.py index 7821b0c..39184aa 100644 --- a/cellar/cellar_extractor/__init__.py +++ b/cellar/cellar_extractor/__init__.py @@ -2,3 +2,5 @@ from cellar_extractor.cellar import get_cellar_extra from cellar_extractor.cellar import get_nodes_and_edges_lists from cellar_extractor.cellar import filter_subject_matter +import logging +logging.basicConfig(level=logging.INFO) \ No newline at end of file diff --git a/cellar/setup.py b/cellar/setup.py index 11bb73c..eec4dce 100644 --- a/cellar/setup.py +++ b/cellar/setup.py @@ -10,7 +10,7 @@ setup( name='cellar_extractor', packages=find_packages(include=['cellar_extractor']), - version='1.0.60', + version='1.0.61', description='Library for extracting cellar data', author='LawTech Lab', license='MIT',
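
The net effect of the patches above is that cellar_extractor reports progress through the standard logging module (configured to INFO at import time in __init__.py), filters subject matter case-insensitively, and builds its node and edge lists from plain Python lists rather than pandas Series. A minimal usage sketch follows; it assumes the package is installed from PyPI and that get_cellar is exported from the package root alongside get_nodes_and_edges_lists and filter_subject_matter (only the latter two exports are visible in the __init__.py hunk above), so treat it as illustrative rather than a verbatim recipe.

    import logging

    # Assumed import list: get_cellar's export is not shown in the __init__.py
    # hunk above, while the other two names are.
    from cellar_extractor import (
        get_cellar,
        get_nodes_and_edges_lists,
        filter_subject_matter,
    )

    # Patch 11/11 calls logging.basicConfig(level=logging.INFO) at import time;
    # an application that wants a quieter root logger can raise the threshold
    # back to WARNING after importing the package.
    logging.getLogger().setLevel(logging.WARNING)

    # In-memory extraction, mirroring Testing_file.py: save_file='n' returns a
    # DataFrame instead of writing into the data/ directory.
    data = get_cellar(sd='2023-01-01', max_ecli=100, save_file='n')

    if data is not False:
        # Case-insensitive subject-matter filter from the "small fix" patch.
        priced = filter_subject_matter(data, "Prices")
        # Node and edge lists built from the CELEX IDENTIFIER / citing columns.
        nodes, edges = get_nodes_and_edges_lists(df=data, only_local=False)

Calling logging.basicConfig inside __init__.py is a convenience for one-off scripts; applications that configure logging themselves will usually want to reset the root level after the import, as the sketch does.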