From 60e561db52c0cd6eeba784176a024129d894824d Mon Sep 17 00:00:00 2001 From: Piotr Lewandowski Date: Sat, 15 Jul 2023 14:28:32 +0200 Subject: [PATCH 01/11] removing anything non-celllar --- .github/workflows/github-actions.yml | 2 - README.md | 3 +- RS_citations/README.md | 103 ----- .../__init__.py | 1 - .../citations_extractor.py | 332 --------------- .../testing.py | 10 - RS_citations/setup.py | 25 -- echr/README.md | 227 ---------- echr/echr_extractor/ECHR_html_downloader.py | 79 ---- .../echr_extractor/ECHR_metadata_harvester.py | 230 ----------- .../ECHR_nodes_edges_list_transform.py | 301 -------------- echr/echr_extractor/__init__.py | 1 - echr/echr_extractor/clean_ref.py | 9 - echr/echr_extractor/echr.py | 96 ----- echr/echr_extractor/testing_file.py | 32 -- echr/setup.py | 26 -- rechtspraak/README.md | 159 ------- rechtspraak/rechtspraak_extractor/__init__.py | 4 - .../rechtspraak_extractor/rechtspraak.py | 117 ------ .../rechtspraak_functions.py | 44 -- .../rechtspraak_metadata.py | 387 ------------------ .../rechtspraak_extractor/testing_file.py | 6 - .../rechtspraak_extractor/tests/__init__.py | 12 - .../tests/rechtspraak.py | 140 ------- .../tests/rechtspraak_functions.py | 41 -- .../tests/rechtspraak_metadata.py | 319 --------------- rechtspraak/setup.py | 25 -- tests.py | 65 --- 28 files changed, 2 insertions(+), 2794 deletions(-) delete mode 100644 RS_citations/README.md delete mode 100644 RS_citations/rechtspraak_citations_extractor/__init__.py delete mode 100644 RS_citations/rechtspraak_citations_extractor/citations_extractor.py delete mode 100644 RS_citations/rechtspraak_citations_extractor/testing.py delete mode 100644 RS_citations/setup.py delete mode 100644 echr/README.md delete mode 100644 echr/echr_extractor/ECHR_html_downloader.py delete mode 100644 echr/echr_extractor/ECHR_metadata_harvester.py delete mode 100644 echr/echr_extractor/ECHR_nodes_edges_list_transform.py delete mode 100644 echr/echr_extractor/__init__.py delete mode 100644 echr/echr_extractor/clean_ref.py delete mode 100644 echr/echr_extractor/echr.py delete mode 100644 echr/echr_extractor/testing_file.py delete mode 100644 echr/setup.py delete mode 100644 rechtspraak/README.md delete mode 100644 rechtspraak/rechtspraak_extractor/__init__.py delete mode 100644 rechtspraak/rechtspraak_extractor/rechtspraak.py delete mode 100644 rechtspraak/rechtspraak_extractor/rechtspraak_functions.py delete mode 100644 rechtspraak/rechtspraak_extractor/rechtspraak_metadata.py delete mode 100644 rechtspraak/rechtspraak_extractor/testing_file.py delete mode 100644 rechtspraak/rechtspraak_extractor/tests/__init__.py delete mode 100644 rechtspraak/rechtspraak_extractor/tests/rechtspraak.py delete mode 100644 rechtspraak/rechtspraak_extractor/tests/rechtspraak_functions.py delete mode 100644 rechtspraak/rechtspraak_extractor/tests/rechtspraak_metadata.py delete mode 100644 rechtspraak/setup.py diff --git a/.github/workflows/github-actions.yml b/.github/workflows/github-actions.yml index d5f729f..ea25b00 100644 --- a/.github/workflows/github-actions.yml +++ b/.github/workflows/github-actions.yml @@ -18,8 +18,6 @@ jobs: run: | python -m pip install --upgrade pip pip install cellar-extractor - pip install rechtspraak-extractor - pip install echr-extractor # pip install echr-extractor - run: echo "💡 The ${{ github.repository }} repository has been cloned to the runner." - run: echo "🖥️ The workflow is now ready to test your code on the runner." 
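The workflow above now installs only cellar-extractor, the one package this repository keeps, and the README change below points to its PyPI documentation. For orientation, a minimal usage sketch follows; the function name get_cellar and the max_ecli, sd and save_file parameters are assumed by analogy with the rechtspraak and echr extractors removed in this patch and should be checked against the cellar-extractor documentation.

```
# Assumed usage sketch for the retained cellar-extractor package.
# get_cellar and its parameters are not verified here; consult the linked PyPI docs.
import cellar_extractor as cellar

# Fetch roughly 100 CELLAR decisions published since 1 August 2022, kept in memory.
df = cellar.get_cellar(max_ecli=100, sd='2022-08-01', save_file='n')
print(df.head())
```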
diff --git a/README.md b/README.md index 7e1d5c9..34ee8a2 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,3 @@ # extraction_libraries -Python libraries for extracting from data sources like Rechtspraak, ECHR, Cellar +Python library for extracting caselaw data from Cellar. +Full documentation available at [cellar-extractor](https://pypi.org/project/cellar-extractor/). diff --git a/RS_citations/README.md b/RS_citations/README.md deleted file mode 100644 index ec640fa..0000000 --- a/RS_citations/README.md +++ /dev/null @@ -1,103 +0,0 @@ -## Rechtspraak citations -This library contains a function that acquires citation data for Rechtspraak cases using the LIDO API. - -## Version -Python 3.9 - -## Contributors -
- Cloud956 (Piotr Lewandowski)
- shashankmc
- gijsvd
- - -## How to install? -pip install rechtspraak_citations_extractor - -## What are the functions? -
  • Rechtspraak Citations Extractor -
      -
    1. get_citations
    2. - Gets all the data about case law citing/being cited and the legislation cited by the cases in the passed DataFrame of case metadata. Requires a valid DataFrame object with a column titled 'ecli'. Returns the same DataFrame object, with 3 additional columns containing JSON strings of citation information. 
  • - -## What are the parameters? -
      -
    1. get_citations(dataframe = None, username = '', password = '', threads = 2)
    2. - Parameters: dataframe (a DataFrame of Rechtspraak case metadata containing an 'ecli' column), username and password (LIDO API credentials), threads (number of threads used to fetch citations). 
    - - -## Examples -``` -import rechtspraak_extractor as rex -import rechtspraak_citations_extractor as rex_citations ------------------------------------------------------------------------------------------------------------------------ - -# To get the rechtspraak data in a dataframe: -df = rex.get_rechtspraak(max_ecli=100, sd='2022-08-01', save_file='y') # Gets 100 ECLIs from 1st August 2022 -df = get_rechtspraak_metadata(save_file='n',dataframe=df) -# To get the citations: -df_with_citaitons = rex_citations.get_citations(df,'username','password') -``` - - -## License -[![License: Apache 2.0](https://img.shields.io/github/license/maastrichtlawtech/extraction_libraries)](https://opensource.org/licenses/Apache-2.0) - -Previously under the [MIT License](https://opensource.org/licenses/MIT), as of 28/10/2022 this work is licensed under a [Apache License, Version 2.0](https://opensource.org/licenses/Apache-2.0). -``` -Apache License, Version 2.0 - -Copyright (c) 2022 Maastricht Law & Tech Lab - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -``` diff --git a/RS_citations/rechtspraak_citations_extractor/__init__.py b/RS_citations/rechtspraak_citations_extractor/__init__.py deleted file mode 100644 index 9d98e28..0000000 --- a/RS_citations/rechtspraak_citations_extractor/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from rechtspraak_citations_extractor.citations_extractor import get_citations diff --git a/RS_citations/rechtspraak_citations_extractor/citations_extractor.py b/RS_citations/rechtspraak_citations_extractor/citations_extractor.py deleted file mode 100644 index 8810807..0000000 --- a/RS_citations/rechtspraak_citations_extractor/citations_extractor.py +++ /dev/null @@ -1,332 +0,0 @@ -import requests -from lxml import etree -import urllib.request -import rdflib -import threading -import json -import pandas as pd -from dotenv import load_dotenv -from requests.auth import HTTPBasicAuth -from tqdm import tqdm -load_dotenv() - -LIDO_ENDPOINT = "http://linkeddata.overheid.nl/service/get-links" - -target_ecli = 'target_ecli' -label = 'label' -type = 'type' -ecli = 'ecli' -case_citations_fieldnames = [target_ecli, label, type] -legislation_citations_fieldnames = ['legal_provision_url_lido', 'legal_provision_url', 'legal_provision'] - - -def remove_spaces_from_ecli(ecli): - return ecli.replace(" ", "") - - -def write_incremental_rows(filename, data): - with open(filename, 'a') as f: - pd.DataFrame(data).to_csv(f, mode='a', header=not f.tell(), index=False) - - -# Code to execute LIDO API call -def get_lido_response(url, username, password): - authentication = HTTPBasicAuth(username, password) - response = requests.get(url, auth=authentication) - if response.status_code == 200: - return response.text - else: - raise Exception('LinkedData responded with code {}: {}. 
{}'.format(response.status_code, response.reason, url)) - - -# Extract the ECLI code from the LIDO identifier of the cited case law from the XML response from LIDO API -def get_ecli(sub_ref): - return sub_ref.attrib['idref'].split('/')[-1] - - -# Extract the LIDO identifier of the cited legislation from the XML response from LIDO API -def get_legislation_identifier(sub_ref): - return sub_ref.attrib['idref'] - - -# Find the webpage expressing, in writing, the legislation referred to by the input LIDO identifier -def get_legislation_webpage(identifier): - idcomponents = identifier.split("/") - date = idcomponents[len(idcomponents) - 1] - url = identifier - page = urllib.request.urlopen(url) - g = rdflib.Graph() - g.parse(page, format="xml") - article = "" - for s, p, o in g: - if str(p) == "http://purl.org/dc/terms/identifier": - article = o - if date in str(o): - return o - - return article - - -def get_legislation_name(url, username, password): - # turn the response into an xml tree - xml_response = get_lido_response(url, username, password) - xml = etree.fromstring(bytes(xml_response, encoding='utf8')) - - pref_label = "" - title = "" - # RDF main element (root) - for element in xml.iterchildren(): - # there is only one child and it is the "description" in which the rest of the info is - # go through all the tags (all the info) - for el in element.iterchildren(): - # the title (same thing as the preLabel) is the feature we want to be using - if el.tag == "{http://purl.org/dc/terms/}title": - title = el.text - - return title - - -# Check if outgoing links in the XML response from the LIDO API are of type "Jurisprudentie" (case law) -def is_case_law(sub_ref): - return sub_ref.attrib['groep'] == 'Jurisprudentie' - - -# Check if outgoing links in the XML response from the LIDO API are of type "Wet" (legislation) -def is_legislation(sub_ref): - return sub_ref.attrib['groep'] == 'Wet' or sub_ref.attrib['groep'] == 'Artikel' - - -# Extract ECLI code of citation from a lido identifier. -# Example of a LIDO identifier "https://linkeddata.overheid.nl/terms/bwb/id/BWBR0020368/8655654/2016-08-11/2016-08-11" -def get_lido_id(ecli): - return "http://linkeddata.overheid.nl/terms/jurisprudentie/id/" + ecli - - -# Method written by Marion -""" -These methods are used to write the citations incrementally to the csv file (in case it crashes or times out). -It allows us to stop the script whenever we want without loosing our data, and without having to start from the bginning the next time. 
-""" - - -# Main method to execute LIDO API call on a list of ECLIs from a CSV file and extract the citations of each -# Add the implementation of the incremental writing of rows -def find_citations_for_cases(dataframe, username, password): - df_eclis = dataframe.reset_index(drop=True) - - eclis = list(df_eclis['ecli'].dropna()) - total_incoming = [] - total_outgoing = [] - total_legislations = [] - - for i, ecli in enumerate(eclis): - case_citations_incoming, case_citations_outgoing, legislation_citations = find_citations_for_case( - remove_spaces_from_ecli(ecli), case_citations_fieldnames, legislation_citations_fieldnames, username, - password) - if case_citations_incoming: - total_incoming.extend(case_citations_incoming) - if case_citations_outgoing: - total_outgoing.extend(case_citations_outgoing) - if legislation_citations: - total_legislations.extend(legislation_citations) - df_incoming = pd.DataFrame(total_incoming) - df_outgoing = pd.DataFrame(total_outgoing) - df_legislations = pd.DataFrame(total_legislations) - return df_incoming, df_outgoing, df_legislations - - -def citations_multithread_single(big_incoming, big_outgoing, big_legislations, ecli, username, password, current_index,bar): - incoming_df = pd.Series([], dtype='string') - outgoing_df = pd.Series([], dtype='string') - legislations_df = pd.Series([], dtype='string') - for i, ecli in enumerate(ecli): - index = current_index + i - case_citations_incoming, case_citations_outgoing, legislation_citations = find_citations_for_case( - remove_spaces_from_ecli(ecli), case_citations_fieldnames, legislation_citations_fieldnames, username, - password) - if case_citations_incoming: - encoded = json.dumps(case_citations_incoming) - incoming_df[index] = encoded - if case_citations_outgoing: - encoded = json.dumps(case_citations_outgoing) - outgoing_df[index] = encoded - if legislation_citations: - encoded = json.dumps(legislation_citations) - legislations_df[index] = encoded - bar.update(1) - big_incoming.append(incoming_df) - big_outgoing.append(outgoing_df) - big_legislations.append(legislations_df) - - -def add_column_frow_list(data, name, list): - column = pd.Series([], dtype='string') - for l in list: - column = column.append(l) - column.sort_index(inplace=True) - data.insert(1, name, column) - - -def find_citations_for_cases_multithread(dataframe, username, password, threads): - ecli = dataframe['ecli'].dropna().reset_index(drop=True) - length = ecli.size - at_once_threads = int(length / threads) - big_incoming = [] - big_outgoing = [] - big_legislations = [] - threads = [] - bar = tqdm(total=length, colour="GREEN",position=0, leave=True,miniters=int(length/100),maxinterval=10000) - for i in range(0, length, at_once_threads): - curr_ecli = ecli[i:(i + at_once_threads)] - t = threading.Thread(target=citations_multithread_single, - args=[big_incoming, big_outgoing, big_legislations, curr_ecli, username, password, i,bar]) - threads.append(t) - for t in threads: - t.start() - for t in threads: - t.join() - add_column_frow_list(dataframe, 'citations_incoming', big_incoming) - add_column_frow_list(dataframe, 'citations_outgoing', big_outgoing) - add_column_frow_list(dataframe, 'legislations_cited', big_legislations) - return dataframe - - -def add_citations_no_duplicates(already_existing_list, element): - duplicate = False - new_ecli = get_ecli(element) - added_sth_new = True - for stored in already_existing_list: - if stored[target_ecli] == new_ecli: - added_sth_new = False - duplicate = True - break - if not duplicate: - 
already_existing_list.append({target_ecli: new_ecli, - label: element.attrib['label'], - type: element.attrib['type'].split('/id/')[1]}) - return added_sth_new - - -def add_legislations_no_duplicates(list, element): - duplicate = False - new_legislation = get_legislation_identifier(element) - added_sth_new = True - for legs in list: - if new_legislation == legs: - added_sth_new = False - duplicate = True - break - if not duplicate: - list.append(get_legislation_identifier(element)) - return added_sth_new - - -# Main method to execute LIDO API call on the ECLI code of the input case and extract the citations -def find_citations_for_case(ecli, case_citations_fieldnames, legislation_citations_fieldnames, username, password): - xml_elements = [] - case_law_citations_outgoing = [] - legislation_citations = [] - case_law_citations_incoming = [] - start_page = 0 - end_of_pages = False - outgoing = "uitgaande-links" - incoming = "inkomende-links" - - while not end_of_pages: - added_sth_new = False - url = "{}?id={}&start={}&rows={}&output=xml".format(LIDO_ENDPOINT, get_lido_id(ecli), start_page, 100) - start_page += 1 - - xml_text = get_lido_response(url, username, password) - xml_elements.append(etree.fromstring(xml_text.encode('utf8'))) - - for el in xml_elements: - - for sub in list(el.iterchildren('subject')): - - for the_citations in sub.iterchildren(outgoing): - for sub_ref in the_citations.iterchildren(): - if is_case_law(sub_ref): - added_sth_new = add_citations_no_duplicates(case_law_citations_outgoing, sub_ref) - elif is_legislation(sub_ref): - added_sth_new = add_legislations_no_duplicates(legislation_citations, sub_ref) - - for the_citations in sub.iterchildren(incoming): - for sub_ref in the_citations.iterchildren(): - if is_case_law(sub_ref): - added_sth_new = add_citations_no_duplicates(case_law_citations_incoming, sub_ref) - - if not added_sth_new or start_page > 15: - #print(start_page) - end_of_pages = True - - # Remove duplicates empties - - for item in case_law_citations_incoming: - if item[target_ecli] == "": - case_law_citations_incoming.remove(item) - for item in case_law_citations_outgoing: - if item[target_ecli] == "": - case_law_citations_outgoing.remove(item) - - # Remove input case ECLI (for some reason a case can cite itself...) 
- for dicts in case_law_citations_incoming: - if dicts[target_ecli] == remove_spaces_from_ecli(ecli): - case_law_citations_incoming.remove(dicts) - break - for dicts in case_law_citations_outgoing: - if dicts[target_ecli] == remove_spaces_from_ecli(ecli): - case_law_citations_outgoing.remove(dicts) - break - if (remove_spaces_from_ecli(ecli) in case_law_citations_incoming): - case_law_citations_incoming.remove(remove_spaces_from_ecli(ecli)) - - case_law_result_outgoing = extract_results_citations(case_law_citations_outgoing, ecli, case_citations_fieldnames) - case_law_results_incoming = extract_results_citations(case_law_citations_incoming, ecli, case_citations_fieldnames) - legislation_results = extract_results_legislations(legislation_citations, ecli, legislation_citations_fieldnames, - username, password) - - return case_law_results_incoming, case_law_result_outgoing, legislation_results - - -def extract_results_citations(list, ecli, fields): - list_of_all_results = [] - - for case_citation in list: - case_law_result = {key: None for key in fields} - case_law_result[fields[0]] = (remove_spaces_from_ecli(case_citation[target_ecli])) # Target ECLI - case_law_result[fields[1]] = (case_citation['label']) # Target ECLI - case_law_result[fields[2]] = (case_citation['type']) # Target ECLI - list_of_all_results.append(case_law_result) - return list_of_all_results - - -def extract_results_legislations(list, ecli, fields, username, password): - list_of_all_results = [] - - for leg_citation in list: - legislation_result = {key: None for key in fields} - legislation_result[fields[0]] = (leg_citation) # Target article - legislation_result[fields[1]] = (get_legislation_webpage(leg_citation)) # Target article webpage - legislation_result[fields[2]] = ( - get_legislation_name(leg_citation, username, password)) # pref label == article name - list_of_all_results.append(legislation_result) - return list_of_all_results - - -def get_citations(dataframe=None, username="", password="", threads=1): - if dataframe is None or not username or not password: - print("Incorrect arguments passed. Returning...") - return False - try: - get_lido_response(LIDO_ENDPOINT,username,password) - except: - print('LIDO cannot be accessed with these login details. 
Returning...') - return False - print('\n--- START OF RS CITATIONS EXTRACTIONS ---\n') - - # find citations, and save the file incrementally - df = find_citations_for_cases_multithread(dataframe, username, password, threads) - - print("\n--- DONE ---") - return df diff --git a/RS_citations/rechtspraak_citations_extractor/testing.py b/RS_citations/rechtspraak_citations_extractor/testing.py deleted file mode 100644 index 38f1cc7..0000000 --- a/RS_citations/rechtspraak_citations_extractor/testing.py +++ /dev/null @@ -1,10 +0,0 @@ -import pandas as pd - - -from citations_extractor import get_citations - -if __name__ == '__main__': - name = 'rechtspraak_2018-01-01_2023-06-02_17-45-29_metadata.csv' - data = pd.read_csv(name) - df = get_citations(data,'','',2) - b=2 \ No newline at end of file diff --git a/RS_citations/setup.py b/RS_citations/setup.py deleted file mode 100644 index 90ba586..0000000 --- a/RS_citations/setup.py +++ /dev/null @@ -1,25 +0,0 @@ -# This file is required to create a python library - -from setuptools import find_packages, setup -from pathlib import Path - -p = Path("README.md") -long_descr = p.read_text() - -setup( - name='rechtspraak_citations_extractor', - packages=find_packages(include=['rechtspraak_citations_extractor']), - version='1.0.8', - description='Library for extracting rechtspraak citations via LIDO', - author='LawTech Lab', - license='MIT', - install_requires=['requests>=2.26.0', 'python_dotenv==0.15.0', 'pandas >=1.2.5','urllib3>=1.26.12','lxml>=4.6.3','tqdm'], - author_email='p.lewandowski@student.maastrichtuniversity.nl', - keywords=['rechtspraak', 'citations', 'rechtspraak citations', 'RS citations'], - long_description=long_descr, - long_description_content_type='text/markdown', - project_urls={ - "Bug Tracker": "https://github.com/maastrichtlawtech/extraction_libraries", - "Build Source": "https://github.com/maastrichtlawtech/extraction_libraries", - }, -) \ No newline at end of file diff --git a/echr/README.md b/echr/README.md deleted file mode 100644 index 89558b6..0000000 --- a/echr/README.md +++ /dev/null @@ -1,227 +0,0 @@ -## echr extractor -This library contains functions to get ECHR data. - -## Version -Python 3.9 - -## Contributors - - - - - - - - - - - - -
    - brodriguesdemiranda (Benjamin Rodrigues de Miranda)
    - ChloeCro (Chloe Crombach)
    - Cloud956 (Piotr Lewandowski)
    - pranavnbapat (Pranav Bapat)
    - running-machin
    - shashankmc
    - gijsvd
    - - -## How to install? -pip install echr-extractor - -## What are the functions? -
      -
    1. get_echr
    2. - Gets all of the available metadata for ECHR cases from the HUDOC database. Can be saved to a file or returned in memory. 
      -
    3. get_echr_extra
    4. - Gets all of the available metadata for ECHR cases from the HUDOC database and, on top of that, downloads the full text of each case. Can be saved to a file or returned in memory. 
      -
    5. get_nodes_edges
    6. - Gets all of the available nodes and edges for ECHR cases from the given HUDOC metadata. 
    - -## What are the parameters? -
      -
    1. get_echr
    2. - Parameters: start_id, end_id, count, start_date, end_date, verbose, save_file, fields, link, language. 
    3. get_echr_extra
    4. - Parameters: the same as get_echr, plus threads (the number of threads used to download the full texts). 
    5. get_nodes_edges
    6. - Parameters: metadata_path (path to a previously saved metadata CSV file) and save_file. 
    - -## Examples - -``` -import echr_extractor as echr - -Below are examples for in-file saving: - -df, json = echr.get_echr_extra(count=100,save_file='y',threads=10) -df = echr.get_echr(start_id=1,save_file='y',skip_missing_dates=True) - -Below are examples for in-memory saving: - -df, json = echr.get_echr_extra(start_id=20,end_id=3000,save_file='n') - -df = echr.get_echr(start_id=1000,count=2000,save_file='n',verbose=True) - -nodes, edges = echr.get_nodes_edges(metadata_path='data/echr_metadata.csv',save_file='n') -``` -``` - -## License -[![License: Apache 2.0](https://img.shields.io/github/license/maastrichtlawtech/extraction_libraries)](https://opensource.org/licenses/Apache-2.0) - -Previously under the [MIT License](https://opensource.org/licenses/MIT), as of 28/10/2022 this work is licensed under a [Apache License, Version 2.0](https://opensource.org/licenses/Apache-2.0). -``` -Apache License, Version 2.0 - -Copyright (c) 2022 Maastricht Law & Tech Lab - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -``` - - -## Appendix - -``` -To properly use the 'link' parameter of the extraction methods, the user should head to - -https://hudoc.echr.coe.int/eng#%20 - -There, the user can use the tools of Advanced Search of HUDOC to search for specific cases. -Afterwards*, the user can copy the link of the current website, and pass it on to the extraction methods. - - -* It should be noted that the link only updates after the 'search' button of the Advanced Search is clicked. - - - -The full list of fields is as follows: - -fields = ['itemid','applicability','application','appno','article','conclusion','decisiondate','docname', -'documentcollectionid','documentcollectionid2','doctype','doctypebranch','ecli','externalsources','extractedappno', -'importance','introductiondate','isplaceholder','issue','judgementdate','kpdate','kpdateAsText','kpthesaurus', -'languageisocode','meetingnumber','originatingbody','publishedby','Rank','referencedate','reportdate','representedby', -'resolutiondate',resolutionnumber','respondent','respondentOrderEng','rulesofcourt','separateopinion','scl', -'sharepointid','typedescription','nonviolation','violation'] - -``` -These fields can take different values, for more information head to https://hudoc.echr.coe.int. 
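As a complement to the Appendix above, here is a brief, illustrative sketch of passing a HUDOC Advanced Search link and a reduced field list to the extractor; the link value is a placeholder to be replaced with a URL copied from HUDOC after clicking 'search', and the keyword arguments follow the get_echr signature defined later in this patch.

```
# Illustrative only: the link below is a placeholder for a URL copied from the
# HUDOC Advanced Search page, and the fields are a subset of the Appendix list.
import echr_extractor as echr

hudoc_link = "https://hudoc.echr.coe.int/eng#{...}"  # replace with your own search link
df = echr.get_echr(
    link=hudoc_link,
    fields=['itemid', 'appno', 'ecli', 'judgementdate', 'languageisocode'],
    save_file='n',
    verbose=True,
)
```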
\ No newline at end of file diff --git a/echr/echr_extractor/ECHR_html_downloader.py b/echr/echr_extractor/ECHR_html_downloader.py deleted file mode 100644 index f857de8..0000000 --- a/echr/echr_extractor/ECHR_html_downloader.py +++ /dev/null @@ -1,79 +0,0 @@ -from bs4 import BeautifulSoup -import requests -import threading - -base_url = 'https://hudoc.echr.coe.int/app/conversion/docx/html/body?library=ECHR&id=' - - -def get_full_text_from_html(html_text): - # This method turns the html code from the summary page into text - # It has different cases depending on the first character of the CELEX ID - # Should only be used for summaries extraction - soup = BeautifulSoup(html_text, "html.parser") - for script in soup(["script", "style"]): - script.extract() # rip it out - text = soup.get_text() - # break into lines and remove leading and trailing space on each - lines = (line.strip() for line in text.splitlines()) - # break multi-headlines into a line each - chunks = (phrase.strip() for line in lines for phrase in line.split(" ")) - # drop blank lines - text = '\n'.join(chunk for chunk in chunks if chunk) - text = text.replace(",", "_") - return text - - -def download_full_text_main(df, threads): - item_ids = df['itemid'] - eclis = df['ecli'] - length = item_ids.size - if length>threads: - at_once_threads = int(length / threads) - else: - at_once_threads=length - all_dict = list() - threads = [] - for i in range(0, length, at_once_threads): - curr_ids = item_ids[i:(i + at_once_threads)] - curr_ecli = eclis[i:(i + at_once_threads)] - t = threading.Thread(target=download_full_text_separate, args=(curr_ids,curr_ecli, all_dict)) - threads.append(t) - for t in threads: - t.start() - for t in threads: - t.join() - - json_file=list() - for l in all_dict: - if len(l)>0: - json_file.extend(l) - return json_file - - -def download_full_text_separate(item_ids,eclis, dict_list): - full_list = [] - eclis = eclis.reset_index(drop=True) - item_ids = item_ids.reset_index(drop=True) - def download_html(item_ids,eclis): - retry_ids = [] - retry_eclis = [] - for i in range(len(item_ids)): - item_id=item_ids[i] - ecli=eclis[i] - try: - r = requests.get(base_url + item_id, timeout=1) - json_dict={ - 'item_id': item_id, - 'ecli': ecli, - 'full_text': get_full_text_from_html(r.text) - } - full_list.append(json_dict) - except Exception: - retry_ids.append(item_id) - retry_eclis.append(ecli) - return retry_ids, retry_eclis - - retry_ids, retry_eclis = download_html(item_ids, eclis) - download_html(retry_ids, retry_eclis) - dict_list.append(full_list) - diff --git a/echr/echr_extractor/ECHR_metadata_harvester.py b/echr/echr_extractor/ECHR_metadata_harvester.py deleted file mode 100644 index e93b661..0000000 --- a/echr/echr_extractor/ECHR_metadata_harvester.py +++ /dev/null @@ -1,230 +0,0 @@ -import requests -from datetime import datetime -import pandas as pd - - -def get_r(url, timeout, retry, verbose): - """ - Get data from a URL. If this is uncuccessful it is attempted again up to a number of tries - given by retry. If it is still unsuccessful the batch is skipped. - :param str url: The data source URL. - :param double timeout: The amount of time to wait for a response each attempt. - :param int retry: The number of times to retry upon failure. - :param bool verbose: Whether or not to print extra information. 
- """ - count = 0 - max_attempts = 20 - while count < max_attempts: - try: - r = requests.get(url, timeout=timeout) - return r - except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectTimeout): - count += 1 - if verbose: - print(f"Timeout. Retry attempt {count}.") - if count > retry: - if verbose: - print(f"Unable to connect to {url}. Skipping this batch.") - return None - return None - - -def basic_function(term, values): - values = ['"' + i + '"' for i in values] - main_body = list() - cut_term = term.replace('"', '') - for v in values: - main_body.append(f"({cut_term}={v}) OR ({cut_term}:{v})") - query = f"({' OR '.join(main_body)})" - return query - - -def link_to_query(link): - extra_cases_map = { - "bodyprocedure": '("PROCEDURE" ONEAR(n=1000) terms OR "PROCÉDURE" ONEAR(n=1000) terms)', - "bodyfacts": '("THE FACTS" ONEAR(n=1000) terms OR "EN FAIT" ONEAR(n=1000) terms)', - "bodycomplaints": '("COMPLAINTS" ONEAR(n=1000) terms OR "GRIEFS" ONEAR(n=1000) terms)', - "bodylaw": '("THE LAW" ONEAR(n=1000) terms OR "EN DROIT" ONEAR(n=1000) terms)', - "bodyreasons": '("FOR THESE REASONS" ONEAR(n=1000) terms OR "PAR CES MOTIFS" ONEAR(n=1000) terms)', - "bodyseparateopinions": '(("SEPARATE OPINION" OR "SEPARATE OPINIONS") ONEAR(n=5000) terms OR "OPINION ' - 'SÉPARÉE" ONEAR(n=5000) terms)', - "bodyappendix": '("APPENDIX" ONEAR(n=1000) terms OR "ANNEXE" ONEAR(n=1000) terms)' - } - - def full_text_function(term, values): - return f"({','.join(values)})" - - def date_function(term, values): - values = ['"' + i + '"' for i in values] - query = '(kpdate >= "first_term" AND kpdate <= "second_term")' - query = query.replace("first_term", values[0]) - query = query.replace("second_term", values[1]) - return query - - def advanced_function(term, values): - body = extra_cases_map.get(term) - query = body.replace("terms", ",".join(vals)) - return query - - query_map = { - "docname": basic_function, - "appno": basic_function, - "scl": basic_function, - "rulesofcourt": basic_function, - "applicability": basic_function, - "ecli": basic_function, - "conclusion": basic_function, - "resolutionnumber": basic_function, - "separateopinions": basic_function, - "externalsources": basic_function, - "kpthesaurus": basic_function, - "advopidentifier": basic_function, - "documentcollectionid2": basic_function, - "fulltext": full_text_function, - "kpdate": date_function, - "bodyprocedure": advanced_function, - "bodyfacts": advanced_function, - "bodycomplaints": advanced_function, - "bodylaw": advanced_function, - "bodyreasons": advanced_function, - "bodyseparateopinions": advanced_function, - "bodyappendix": advanced_function, - "languageisocode": basic_function - - } - start = link.index("{") - link_dictionary = eval(link[start:]) - base_query = 'https://hudoc.echr.coe.int/app/query/results?query=contentsitename:ECHR' \ - ' AND (NOT (doctype=PR OR doctype=HFCOMOLD OR doctype=HECOMOLD)) AND ' \ - 'inPutter&select={select}&sort=itemid%20Ascending&start={start}&length={length}' - query_elements = list() - for key in list(link_dictionary.keys()): - vals = link_dictionary.get(key) - funct = query_map.get(key) - query_elements.append(funct(key, vals)) - query_total = ' AND '.join(query_elements) - final_query = base_query.replace('inPutter', query_total) - # print(final_query) - # page = requests.get(final_query) - # results = eval(page.text) - # print(results.get('resultcount')) - return final_query - - -def get_echr_metadata(start_id, end_id, verbose, fields, start_date, end_date, link, language): - """ - Read ECHR 
metadata into a Pandas DataFrame. - :param int start_id: The index to start the search from. - :param int end_id: The index to end search at, where the default fetches all results. - :param date start_date: The point from which to save cases. - :param date end_date: The point before which to save cases. - :param bool verbose: Whether or not to print extra information. - """ - data = [] - if not fields: - fields = ['itemid', 'applicability', 'appno', 'article', 'conclusion', 'docname', - 'doctype', 'doctypebranch', 'ecli', 'importance', 'judgementdate', - 'languageisocode', 'originatingbody', 'violation', 'nonviolation', - 'extractedappno', 'scl', 'publishedby', 'representedby', 'respondent', - 'separateopinion', 'sharepointid', 'externalsources', 'issue', 'referencedate', - 'rulesofcourt', 'DocId', 'WorkId', 'Rank', 'Author', 'Size', 'Path', - 'Description', 'Write', 'CollapsingStatus', 'HighlightedSummary', - 'HighlightedProperties', 'contentclass', 'PictureThumbnailURL', - 'ServerRedirectedURL', 'ServerRedirectedEmbedURL', 'ServerRedirectedPreviewURL', - 'FileExtension', 'ContentTypeId', 'ParentLink', 'ViewsLifeTime', 'ViewsRecent', - 'SectionNames', 'SectionIndexes', 'SiteLogo', 'SiteDescription', 'deeplinks', - 'SiteName', 'IsDocument', 'LastModifiedTime', 'FileType', 'IsContainer', - 'WebTemplate', 'SecondaryFileExtension', 'docaclmeta', 'OriginalPath', - 'EditorOWSUSER', 'DisplayAuthor', 'ResultTypeIdList', 'PartitionId', 'UrlZone', - 'AAMEnabledManagedProperties', 'ResultTypeId', 'rendertemplateid'] - if link: - META_URL = link_to_query(link) - - else: - META_URL = 'http://hudoc.echr.coe.int/app/query/results' \ - '?query=(contentsitename=ECHR) AND ' \ - '(documentcollectionid2:"JUDGMENTS" OR ' \ - 'documentcollectionid2:"COMMUNICATEDCASES" OR ' \ - 'documentcollectionid2:"DECISIONS" OR ' \ - 'documentcollectionid2:"CLIN") AND ' \ - 'lang_inputter' \ - '&select={select}' + \ - '&sort=itemid Ascending' + \ - '&start={start}&length={length}' - - # An example url: "https://hudoc.echr.coe.int/app/query/results?query=(contentsitename=ECHR)%20AND%20(documentcollectionid2:%22JUDGMENTS%22%20OR%20documentcollectionid2:%22COMMUNICATEDCASES%22%20OR%20documentcollectionid2:%22DECISIONS%22%20OR%20documentcollectionid2:%22CLIN%22)&select=itemid,applicability,application,appno,article,conclusion,decisiondate,docname,documentcollectionid,%20documentcollectionid2,doctype,doctypebranch,ecli,externalsources,extractedappno,importance,introductiondate,%20isplaceholder,issue,judgementdate,kpdate,kpdateAsText,kpthesaurus,languageisocode,meetingnumber,%20originatingbody,publishedby,Rank,referencedate,reportdate,representedby,resolutiondate,%20resolutionnumber,respondent,respondentOrderEng,rulesofcourt,separateopinion,scl,sharepointid,typedescription,%20nonviolation,violation&sort=itemid%20Ascending&start=0&length=200" - - if start_date and end_date: - addition = f'(kpdate>="{start_date}" AND kpdate<="{end_date}")' - elif start_date: - end_date = datetime.today().date() - addition = f'(kpdate>="{start_date}" AND kpdate<="{end_date}")' - elif end_date: - start_date = '1900-01-01' - addition = f'(kpdate>="{start_date}" AND kpdate<="{end_date}")' - else: - addition = '' - - if addition: - META_URL = META_URL.replace('(contentsitename=ECHR)', '(contentsitename=ECHR) AND ' + addition) - - META_URL = META_URL.replace(' ', '%20') - META_URL = META_URL.replace('"', '%22') - language_input = basic_function('languageisocode', language) - if not link: - META_URL = META_URL.replace('lang_inputter', language_input) - - 
META_URL = META_URL.replace('{select}', ','.join(fields)) - - - - url = META_URL.format(start=0, length=1) - print(url) - r = requests.get(url) - resultcount = r.json()['resultcount'] - print("available results: ", resultcount) - - if not end_id: - end_id = resultcount - if verbose: - print(f'Fetching {end_id - start_id} results from index {start_id} to index {end_id} ' - f'{f" and filtering cases after {start_date}" if start_date else ""} {f"and filtering cases before {end_date}" if end_date else "."}') - - timeout = 6 - retry = 3 - if start_id + end_id > 500: # HUDOC does not let you fetch more than 500 items in one go. - for i in range(start_id, end_id, 500): - if verbose: - print(" - Fetching information from cases {} to {}.".format(i, i + 500)) - # Format URL based on the incremented index. - url = META_URL.format(start=i, length=500) - if verbose: - print(url) - - # Get the response. - r = get_r(url, timeout, retry, verbose) - if r is not None: - # Get the results list - temp_dict = r.json()['results'] - # Get every document from the results list. - for result in temp_dict: - data.append(result['columns']) - - else: - # Format URL based on start and length - url = META_URL.format(start=start_id, length=end_id) - if verbose: - print(url) - - r = get_r(url, timeout, retry, verbose) - if r is not None: - # Get the results list - temp_dict = r.json()['results'] - # Get every document from the results list. - for result in temp_dict: - data.append(result['columns']) - - if len(data) == 0: - print("Search results ended up empty") - return False - return pd.DataFrame.from_records(data) diff --git a/echr/echr_extractor/ECHR_nodes_edges_list_transform.py b/echr/echr_extractor/ECHR_nodes_edges_list_transform.py deleted file mode 100644 index 2d4bd0c..0000000 --- a/echr/echr_extractor/ECHR_nodes_edges_list_transform.py +++ /dev/null @@ -1,301 +0,0 @@ -import numpy as np -import pandas as pd -import re -import dateparser -from echr_extractor.clean_ref import clean_pattern - - - -def open_metadata(PATH_metadata): - """ - Finds the ECHR metadata file and loads it into a dataframe - - param filename_metadata: string with path to metadata - """ - try: - df = pd.read_csv(PATH_metadata) # change hard coded path - return df - except FileNotFoundError: - print("File not found. Please check the path to the metadata file.") - return False - -def concat_metadata(df): - agg_func = {'itemid' : 'first', 'appno' : 'first', 'article' : 'first', 'conclusion' : 'first' , 'docname' : 'first' , 'doctype' : 'first', - 'doctypebranch' : 'first', 'ecli' : 'first', 'importance' : 'first', 'judgementdate' : 'first', 'languageisocode' : ', '.join, 'originatingbody' : 'first', - 'violation' : 'first', 'nonviolation' : 'first', 'extractedappno' : 'first', 'scl' : 'first'} - new_df = df.groupby('ecli').agg(agg_func) - print(new_df) - return new_df - -def get_language_from_metadata(df): - df = concat_metadata(df) - df.to_json('langisocode-nodes.json', orient="records") - -def metadata_to_nodesedgeslist(df): - """ - Returns a dataframe where column 'article' only contains a certain article - - param df: the complete dataframe from the metadata - """ - - return df - - -def retrieve_nodes_list(df): - """ - Returns a dataframe where 'ecli' is moved to the first column. 
- - param df: the dataframe after article filter - """ - df = metadata_to_nodesedgeslist(df) - col = df.pop("ecli") - df.insert(1, col.name, col) - df.drop(df.columns[0], axis=1, inplace=True) - return df - - -def retrieve_edges_list(df, df_unfiltered): - """ - Returns a dataframe consisting of 2 columns 'ecli' and 'reference' which - indicate a reference link between cases. - - params: - df -- the node list extracted from the metadata - df_unfiltered -- the complete dataframe from the metadata - """ - edges = pd.DataFrame(columns=['ecli', 'references']) - - count = 0 - tot_num_refs = 0 - missing_cases = [] - for index, item in df.iterrows(): - eclis = [] - app_number = [] - extracted_appnos = [] - if item.extractedappno is not np.nan: - extracted_appnos = item.extractedappno.split(';') - - if item.scl is not np.nan: - """ - Split the references from the scl column i nto a list of references. - - Example: - references in string: "Ali v. Switzerland, 5 August 1998, § 32, Reports of Judgments and - Decisions 1998-V;Sevgi Erdogan v. Turkey (striking out), no. 28492/95, 29 April 2003" - - ["Ali v. Switzerland, 5 August 1998, § 32, Reports of Judgments and - Decisions 1998-V", "Sevgi Erdogan v. Turkey (striking out), no. - 28492/95, 29 April 2003"] - """ - ref_list = item.scl.split(';') - new_ref_list = [] - for ref in ref_list: - ref = re.sub('\n', '', ref) - new_ref_list.append(ref) - - tot_num_refs = tot_num_refs + len(ref_list) - - for ref in new_ref_list: - app_number = re.findall("[0-9]{3,5}\/[0-9]{2}", ref) ################ - if len(extracted_appnos) > 0: - app_number = app_number + extracted_appnos - # app_number = app_number + extracted_appnos - app_number = set(app_number) - - if len(app_number) > 0: - # get dataframe with all possible cases by application number - if len(app_number) > 1: - app_number = [';'.join(app_number)] - case = lookup_app_number(app_number, df_unfiltered) - else: # if no application number in reference - # get dataframe with all possible cases by casename - case = lookup_casename(ref, df_unfiltered) - - if len(case) == 0: - case = lookup_casename(ref, df_unfiltered) - - components = ref.split(',') - # get the year of case - year_from_ref = get_year_from_ref(components) - - # remove cases in different language than reference - for id, it in case.iterrows(): - if 'v.' 
in components[0]: - lang = 'ENG' - else: - lang = 'FRE' - - if lang not in it.languageisocode: - case = case[case['languageisocode'].str.contains(lang, regex=False, flags=re.IGNORECASE)] - - for id, i in case.iterrows(): - if i.judgementdate is np.nan: - continue - date = dateparser.parse(i.judgementdate) - year_from_case = date.year - - if year_from_case - year_from_ref == 0: - case = case[case['judgementdate'].str.contains(str(year_from_ref), regex=False, flags=re.IGNORECASE)] - - #case = metadata_to_nodesedgeslist(case) - - if len(case) > 0: - if len(case) > 3: - print("stop") - for _,row in case.iterrows(): - eclis.append(row.ecli) - else: - count = count + 1 - missing_cases.append(ref) - - eclis = set(eclis) - - #add ecli to edges list - if len(eclis) == 0: - continue - else: - edges = pd.concat( - [edges, pd.DataFrame.from_records([{'ecli': item.ecli, 'references': list(eclis)}])]) - - print("num missed cases: ", count) - print("total num of refs: ", tot_num_refs) - missing_cases_set = set(missing_cases) - missing_cases = list(missing_cases_set) - - # Store missing references - missing_df = pd.DataFrame(missing_cases) - # missing_df.to_csv('C:/Users/Chloe/PycharmProjects/case-law-explorer/data/echr/missing_cases.csv', index=False, encoding='utf-8') - edges = edges.groupby('ecli', as_index=False).agg({'references' : 'sum'}) - return edges - -def lookup_app_number(pattern, df): - """ - Returns a list with rows containing the cases linked to the found app numbers. - """ - row = df.loc[df['appno'].isin(pattern)] - - if row.empty: - return pd.DataFrame() - elif row.shape[0] > 1: - return row - else: - return row - - -def lookup_casename(ref, df): - """ - Process the reference for lookup in metadata. - Returns the rows corresponding to the cases. - - - Example of the processing (2 variants) - - - Original reference from scl: - - Hentrich v. France, 22 September 1994, § 42, Series A no. 296-A - - Eur. Court H.R. James and Others judgment of 21 February 1986, - Series A no. 98, p. 46, para. 81 - - Split on ',' and take first item: - Hentrich v. France - Eur. Court H.R. James and Others judgment of 21 February 1986 - - If certain pattern from CLEAN_REF in case name, then remove: - Eur. Court H.R. James and Others judgment of 21 February 1986 --> - James and Others - - Change name to upper case and add additional text to match metadata: - Hentrich v. France --> CASE OF HENTRICH V. FRANCE - James and Others --> CASE OF JAMES AND OTHERS - """ - name = get_casename(ref) - - # DEV note: In case, add more patterns to clean_ref.py in future - patterns = clean_pattern - - uptext = name.upper() - - if 'NO.' in uptext: - uptext = uptext.replace('NO.', 'No.') - - if 'BV' in uptext: - uptext = uptext.replace('BV', 'B.V.') - - if 'v.' in name: - uptext = uptext.replace('V.', 'v.') - lang = 'ENG' - else: - uptext = uptext.replace('C.', 'c.') - lang = 'FRE' - - for pattern in patterns: - uptext = re.sub(pattern, '', uptext) - - uptext = re.sub(r'\[.*', "", uptext) - uptext = uptext.strip() - row = df[df['docname'].str.contains(uptext, regex=False, flags=re.IGNORECASE)] - - # if len(row) == 0: - # print("no cases matched: ", name) - - return row - -def get_casename(ref): - count = 0 - if 'v.' in ref: - slice_at_versus = ref.split('v.') # skip if typo (count how many) - elif 'c.' 
in ref: - slice_at_versus = ref.split('c.') - else: - count = count + 1 - name = ref.split(',') - return name[0] - - num_commas = slice_at_versus[0].count(',') - - if num_commas > 0: - num_commas = num_commas + 1 - name = ",".join(ref.split(",", num_commas)[:num_commas]) - else: - name = ref.split(',') - return name[0] - return name - -def get_year_from_ref(ref): - for component in ref: - if '§' in component: - continue - component = re.sub('judgment of ', "", component) - if dateparser.parse(component) is not None: - date = dateparser.parse(component) - elif ("ECHR" in component or "CEDH" in component): - if ("ECHR" in component or "CEDH" in component): - date = re.sub('ECHR ', '', component) - date = re.sub('CEDH ', '', date) - date = date.strip() - date = re.sub('-.*', '', date) - date = re.sub('\s.*', '', date) - date = dateparser.parse(date) - - try: - return date.year - except: - return 0 - - - -def echr_nodes_edges(metadata_path): - """ - Create nodes and edges list for the ECHR data. - """ - print('\n--- COLLECTING METADATA ---\n') - data = open_metadata(metadata_path) - - print('\n--- EXTRACTING NODES LIST ---\n') - nodes = retrieve_nodes_list(data) - # get_language_from_metadata(nodes) - - print('\n--- EXTRACTING EDGES LIST ---\n') - edges = retrieve_edges_list(nodes, data) - - # nodes.to_json(JSON_ECHR_NODES, orient="records") - # edges.to_json(JSON_ECHR_EDGES, orient="records") - return nodes, edges \ No newline at end of file diff --git a/echr/echr_extractor/__init__.py b/echr/echr_extractor/__init__.py deleted file mode 100644 index a3d8f80..0000000 --- a/echr/echr_extractor/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from echr_extractor.echr import get_echr,get_echr_extra,get_nodes_edges \ No newline at end of file diff --git a/echr/echr_extractor/clean_ref.py b/echr/echr_extractor/clean_ref.py deleted file mode 100644 index 5eb8d18..0000000 --- a/echr/echr_extractor/clean_ref.py +++ /dev/null @@ -1,9 +0,0 @@ -''' -This module contains the list of patterns for reference lookup in metadata -''' - - -clean_pattern = ['EUR. COURT H.R.', - 'JUDGMENT OF.*', - ' DU.*' - ] \ No newline at end of file diff --git a/echr/echr_extractor/echr.py b/echr/echr_extractor/echr.py deleted file mode 100644 index 1ed5987..0000000 --- a/echr/echr_extractor/echr.py +++ /dev/null @@ -1,96 +0,0 @@ -from echr_extractor.ECHR_metadata_harvester import get_echr_metadata -from echr_extractor.ECHR_html_downloader import download_full_text_main -from echr_extractor.ECHR_nodes_edges_list_transform import echr_nodes_edges -from pathlib import Path -import os -import json - -""" -I have replaced the function definition to take all arguments n eede to call read_echr_metadata and I have also -replaced the file naming lines. The old code is commented out I didn't delete anything. :) -On top of this the lines which deal with defining default values have been commented out because this is -handled in read_echr_metadata. It can also be done here but then it should be removed from the other method. -It will probably be necissary to add some file handling to prevent overwriting. I'm not sure if you have plans for this -already but feel free to shoot me a message about it seeing as I did something similar for the ECHR branch. 
@Benjamin -""" - - -def get_echr(start_id=0, end_id=None, start_date=None, count=None, end_date=None, verbose=False, save_file='y', - fields=None, link=None, language=None): - if language is None: - language = ["ENG"] - if count: - end_id = int(start_id) + count - print(f"--- STARTING ECHR DOWNLOAD FOR ---") - df = get_echr_metadata(start_id=start_id, end_id=end_id, start_date=start_date, end_date=end_date, - verbose=verbose, fields=fields, link=link, language=language) - if df is False: - return False - if save_file == "y": - filename = determine_filename(start_id, end_id, start_date, end_date) - Path('data').mkdir(parents=True, exist_ok=True) - file_path = os.path.join('data', filename + '.csv') - df.to_csv(file_path, index=False) - print("\n--- DONE ---") - return df - else: - print("\n--- DONE ---") - return df - - -def determine_filename(start_id, end_id, start_date, end_date): - if end_id: - if start_date and end_date: - filename = f"echr_metadata_index_{start_id}-{end_id}_dates_{start_date}-{end_date}" - elif start_date: - filename = f"echr_metadata_{start_id}-{end_id}_dates_{start_date}-END" - elif end_date: - filename = f"echr_metadata_{start_id}-{end_id}_datesSTART-{end_date}" - else: - filename = f"echr_metadata_{start_id}-{end_id}_dates_START-END" - else: - if start_date and end_date: - filename = f"echr_metadata_index_{start_id}-ALL_dates_{start_date}-{end_date}" - elif start_date: - filename = f"echr_metadata_{start_id}-ALL_dates_{start_date}-END" - elif end_date: - filename = f"echr_metadata_{start_id}-ALL_dates_START-{end_date}" - else: - filename = f"echr_metadata_{start_id}-ALL_dates_START-END" - return filename - - -def get_echr_extra(start_id=0, end_id=None, start_date=None, count=None, end_date=None, verbose=False, - save_file='y', threads=10, fields=None, link=None, language=None): - df = get_echr(start_id=start_id, end_id=end_id, start_date=start_date, end_date=end_date, verbose=verbose, - count=count, save_file='n', fields=fields, link=link, language=language) - print("Full-text download will now begin") - if df is False: - return False, False - json_list = download_full_text_main(df, threads) - print("Full-text download finished") - if save_file == "y": - filename = determine_filename(start_id, end_id, start_date, end_date) - filename_json = filename.replace("metadata", "full_text") - Path('data').mkdir(parents=True, exist_ok=True) - file_path = os.path.join('data', filename + '.csv') - df.to_csv(file_path, index=False) - file_path_json = os.path.join('data', filename_json + '.json') - with open(file_path_json, "w") as f: - json.dump(json_list, f) - return df, json_list - else: - return df, json_list - - -def get_nodes_edges(metadata_path, save_file='y'): - nodes, edges = echr_nodes_edges(metadata_path) - if save_file == "y": - Path('data').mkdir(parents=True, exist_ok=True) - edges.to_csv(os.path.join('data', 'ECHR_edges.csv'), index=False, encoding='utf-8') - nodes.to_csv(os.path.join('data', 'ECHR_nodes.csv'), index=False, encoding='utf-8') - nodes.to_json(os.path.join('data', 'ECHR_nodes.json'), orient="records") - edges.to_json(os.path.join('data', 'ECHR_edges.json'), orient="records") - return nodes, edges - - return nodes, edges diff --git a/echr/echr_extractor/testing_file.py b/echr/echr_extractor/testing_file.py deleted file mode 100644 index 29ff259..0000000 --- a/echr/echr_extractor/testing_file.py +++ /dev/null @@ -1,32 +0,0 @@ -import os,sys -from os.path import dirname, abspath -from pathlib import Path,PurePath - -current_dir = (abspath(__file__)) 
-correct_dir = '\\'.join(current_dir.replace('\\', '/').split('/')[:-2]) -sys.path.append(correct_dir) -# print(sys.path) - - -from echr_extractor.echr import get_echr_extra, get_echr, get_nodes_edges -import dateutil.parser - -import datetime -if __name__ == '__main__': - df = get_echr_extra(count=100,save_file='y',language=["FRE","ENG"],start_date='2023-01-01') - - - - - """ - Start and end dates must be date objects, which can be achieved by calling dateutil.parser.parse(some date string).date(). - I assume you dont want to do that in this file but im not sure where this conversion is most appropriate so I'll leave it up to you. - Note that there is an extra import because of this. - I have commented out some of your stuff to test this, if you run it as is it should work. @Benjamin - """ - print(str(datetime.datetime.today().date())) - #df = get_echr_extra(count=100,threads=5,start_date='2000-01-01',end_date='2023-01-01') - #df,json = get_echr_extra(start_id=20,end_id=3000,save_file='n') - - #df = get_echr(start_id=1000,count=2000,save_file='n') - diff --git a/echr/setup.py b/echr/setup.py deleted file mode 100644 index 8c5805a..0000000 --- a/echr/setup.py +++ /dev/null @@ -1,26 +0,0 @@ -# This file is required to create a python library - -from setuptools import find_packages, setup -from pathlib import Path - -p = Path("README.md") -long_descr = p.read_text() - -setup( - name='echr_extractor', - packages=find_packages(include=['echr_extractor']), - version='1.0.21', - description='Library for extracting ECHR data', - author='LawTech Lab', - license='MIT', - install_requires=["requests~=2.26.0","pandas~=1.2.5","beautifulsoup4~=4.9.3", "dateparser"], - author_email='a.gade@student.maastrichtuniversity.nl', - keywords=['echr', 'extractor', 'european', 'convention', 'human', 'rights', 'european convention', 'human rights', - 'european convention on human rights'], - long_description=long_descr, - long_description_content_type='text/markdown', - project_urls={ - "Bug Tracker": "https://github.com/maastrichtlawtech/extraction_libraries", - "Build Source": "https://github.com/maastrichtlawtech/extraction_libraries", - }, -) \ No newline at end of file diff --git a/rechtspraak/README.md b/rechtspraak/README.md deleted file mode 100644 index a3904cf..0000000 --- a/rechtspraak/README.md +++ /dev/null @@ -1,159 +0,0 @@ -## Rechtspraak extractor -This library contains two functions to get rechtspraak data and metadata from the API. - -## Version -Python 3.9 - -## Contributors - - - - - - - - - - -
    - pranavnbapat (Pranav Bapat)
    - running-machin
    - Cloud956 (Piotr Lewandowski)
    - shashankmc
    - gijsvd
    - - -## How to install? -pip install rechtspraak_extractor - -## What are the functions? -
  • Rechtspraak Extractor -
      -
    1. get_rechtspraak
    2. - Gets all the ECLIs and saves them to a CSV file or returns them in memory. 
      It retrieves the ECLI, title, summary, updated date, and link. 
    3. get_rechtspraak_metadata
    4. - Gets the metadata of the ECLIs retrieved by the above function and saves it to a new CSV file or returns it in memory. 
      The link attribute returned by the above function contains the links to the ECLI metadata. 
      It retrieves instantie, datum uitspraak, datum publicatie, zaaknummer, rechtsgebieden, bijzondere kenmerken, inhoudsindicatie, and vindplaatsen. 
  • - -## What are the parameters? -
      -
    1. get_rechtspraak(max_ecli=100, sd='2022-05-01', ed='2022-10-01', save_file='y')
    2. - Parameters: max_ecli (maximum number of ECLIs to retrieve), sd (start date of publication, yyyy-mm-dd), ed (optional end date, defaults to the current date), save_file ('y' saves a CSV file in the data folder, 'n' returns a dataframe). 
    3. get_rechtspraak_metadata
    4. - Parameters: save_file ('y' or 'n'), plus either dataframe (a dataframe created by get_rechtspraak) or filename (a CSV file in the data folder created by get_rechtspraak). 
    - - -## Examples -``` -import rechtspraak_extractor as rex - ------------------------------------------------------------------------------------------------------------------------ - -# For rechtspraak - -# To get the rechtspraak data in a dataframe: -df = rex.get_rechtspraak(max_ecli=100, sd='2022-08-01', save_file='n') # Gets 100 ECLIs from 1st August 2022 - -# To save rechtspraak data as a CSV file: -rex.get_rechtspraak(max_ecli=100, sd='2022-08-01', save_file='y') - ------------------------------------------------------------------------------------------------------------------------ - -# For rechtspraak metadata - -# To get metadata as a dataframe from rechtspraak data (as a dataframe): -df_metadata = rex.get_rechtspraak_metadata(save_file='n', dataframe=df) - -# To get metadata as a dataframe from rechtspraak file (as a dataframe): -df_metadata = rex.get_rechtspraak_metadata(save_file='n', filename='rechtspraak.csv') - -# To get metadata as a dataframe from rechtspraak data (saved as CSV file): -rex.get_rechtspraak_metadata(save_file='y', dataframe=df) - -# To get metadata and save as a CSV file: -rex.get_rechtspraak_metadata(save_file='y', filename='rechtspraak.csv') - ------------------------------------------------------------------------------------------------------------------------ - -# filename='rechtspraak.csv' - filename.csv is a file from the data folder created by get_rechtspraak method -# dataframe=df - df is a dataframe created by get_rechtspraak method - -# Will not get any metadata -df = rex.get_rechtspraak_metadata(save_file='n') - -# Will get the metadata of all the files in the data folder -rex.get_rechtspraak_metadata(save_file='y') -``` - - -## License -[![License: Apache 2.0](https://img.shields.io/github/license/maastrichtlawtech/extraction_libraries)](https://opensource.org/licenses/Apache-2.0) - -Previously under the [MIT License](https://opensource.org/licenses/MIT), as of 28/10/2022 this work is licensed under a [Apache License, Version 2.0](https://opensource.org/licenses/Apache-2.0). -``` -Apache License, Version 2.0 - -Copyright (c) 2022 Maastricht Law & Tech Lab - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -``` diff --git a/rechtspraak/rechtspraak_extractor/__init__.py b/rechtspraak/rechtspraak_extractor/__init__.py deleted file mode 100644 index 13f583b..0000000 --- a/rechtspraak/rechtspraak_extractor/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# from rechtspraak_extractor import rechtspraak -# from rechtspraak_extractor import rechtspraak_metadata -from rechtspraak_extractor.rechtspraak import get_rechtspraak -from rechtspraak_extractor.rechtspraak_metadata import get_rechtspraak_metadata diff --git a/rechtspraak/rechtspraak_extractor/rechtspraak.py b/rechtspraak/rechtspraak_extractor/rechtspraak.py deleted file mode 100644 index aaac035..0000000 --- a/rechtspraak/rechtspraak_extractor/rechtspraak.py +++ /dev/null @@ -1,117 +0,0 @@ -# This file is used to get all the Rechtspraak ECLIs from an API. -# It takes two required arguments and one optional argument -# 1. 
max - Maximum number of ECLIs to retrieve -# 2. starting-date (yyyy-mm-dd) - Start date of ECLI publication -# 3. ending-date (yyyy-mm-dd) - It's an optional parameter. If not given, current date will be automatically chosen -# File is stored in data/rechtspraak folder - -import json -import xmltodict -import os -from datetime import date, datetime -from rechtspraak_extractor.rechtspraak_functions import * - - -# Define base URL -RECHTSPRAAK_API_BASE_URL = "https://data.rechtspraak.nl/uitspraken/zoeken?" - - -def get_data_from_url(url): - res = requests.get(url) - res.raw.decode_content = True - - # Convert the XML data to JSON format - xpars = xmltodict.parse(res.text) - json_string = json.dumps(xpars) - json_object = json.loads(json_string) - - # Get the JSON object from a specific branch - json_object = json_object['feed']['entry'] - - return json_object - - -def save_csv(json_object, file_name, save_file): - # Define the dataframe to enter the data - df = pd.DataFrame(columns=['id', 'title', 'summary', 'updated', 'link']) - ecli_id = [] - title = [] - summary = [] - updated = [] - link = [] - - # Iterate over the object and fill the lists - for i in json_object: - ecli_id.append(i['id']) - title.append(i['title']['#text']) - if '#text' in i['summary']: - summary.append(i['summary']['#text']) - else: - summary.append("No summary available") - updated.append(i['updated']) - link.append(i['link']['@href']) - - # Save the lists to dataframe - df['id'] = ecli_id - df['title'] = title - df['summary'] = summary - df['updated'] = updated - df['link'] = link - - if save_file == 'y': - # Create directory if not exists - Path('data').mkdir(parents=True, exist_ok=True) - - # Save CSV file - # file_path = os.path.join('data', file_name + '.csv') - df.to_csv('data/' + file_name + '.csv', index=False, encoding='utf8') - print("Data saved to CSV file successfully.") - return df - -def get_rechtspraak(max_ecli=100, sd='1900-01-01', ed=None, save_file='y'): - print("Rechtspraak dump downloader API") - - amount = max_ecli - starting_date = sd - save_file = save_file - - # If the end date is not entered, the current date is taken - today = date.today() - if ed: - ending_date = ed - else: - ending_date = today.strftime("%Y-%m-%d") - - # Used to calculate total execution time - start_time = time.time() - - # Build the URL after getting all the arguments - url = RECHTSPRAAK_API_BASE_URL + 'max=' + str(amount) + '&date=' + starting_date + '&date=' + ending_date - - print("Checking the API") - # Check the working of API - response_code = check_api(url) - if response_code == 200: - print("API is working fine!") - print("Getting " + str(amount) + " documents from " + starting_date + " till " + ending_date) - - json_object = get_data_from_url(url) - print(f"Found {len(json_object)} cases!") - if json_object: - # Get current time - current_time = datetime.now().strftime("%H-%M-%S") - - # Build file name - file_name = 'rechtspraak_' + starting_date + '_' + ending_date + '_' + current_time - - - get_exe_time(start_time) - - if save_file == 'n': - global_rs_df = save_csv(json_object, file_name, save_file) - return global_rs_df - else: - save_csv(json_object, file_name, save_file) - return - else: - print(f"URL returned with a {response_code} error code") diff --git a/rechtspraak/rechtspraak_extractor/rechtspraak_functions.py b/rechtspraak/rechtspraak_extractor/rechtspraak_functions.py deleted file mode 100644 index 02fa28b..0000000 --- a/rechtspraak/rechtspraak_extractor/rechtspraak_functions.py +++ /dev/null @@ 
-1,44 +0,0 @@ -import requests, glob, time -from pathlib import Path -import pandas as pd - - -# Check whether the API is working or not and return with the response code -def check_api(url): - response = requests.get(f"{url}") - - # Return with the response code - return response.status_code - - -# Reads all the CSV files in a folder and returns the list of files -# It also has an optional parameter "exclude". By default, it's None. If you want to exclude files having a certain -# word in the file name, you may give a value -# It also only grabs data if it has rechtspraak in it -# As that was causing issues with other csv data present -def read_csv(dir_name, exclude=None): - path = dir_name - csv_files = glob.glob(path + "/*.csv") - files = [] - for i in csv_files: - if exclude is not None: - if exclude not in i and "rechtspraak" in i: - files.append(i) - else: - if "rechtspraak" in i: - files.append(i) - - print("Found " + str(len(files)) + " CSV file(s)\n") - return files - - -# Get total execution time -def get_exe_time(start_time): - end_time = time.time() - sec = end_time - start_time - mins = sec // 60 - sec = sec % 60 - hours = mins // 60 - mins = mins % 60 - print("Total execution time: {0}:{1}:{2}".format(int(hours), int(mins), round(sec, 2))) - print("\n") diff --git a/rechtspraak/rechtspraak_extractor/rechtspraak_metadata.py b/rechtspraak/rechtspraak_extractor/rechtspraak_metadata.py deleted file mode 100644 index c315359..0000000 --- a/rechtspraak/rechtspraak_extractor/rechtspraak_metadata.py +++ /dev/null @@ -1,387 +0,0 @@ -# This file is used for getting the metadata of the ECLIs obtained using rechspraak_api file. This file takes all the -# CSV file created by rechspraak_api, picks up ECLIs and links column, and using an API gets the metadata and saves it -# in another CSV file with metadata suffix. -# This happens in async manner. -import pathlib -import os -import urllib -import multiprocessing -from bs4 import BeautifulSoup -from datetime import datetime -from concurrent.futures import ThreadPoolExecutor -import platform -import shutil -from tqdm import tqdm -from rechtspraak_extractor.rechtspraak_functions import * -from functools import partial -# Define base url -RECHTSPRAAK_METADATA_API_BASE_URL = "http://data.rechtspraak.nl/uitspraken/content?id=" # old one = "https://uitspraken.rechtspraak.nl/#!/details?id=" -return_type = "&return=DOC" - -# Define empty lists where we'll store our data temporarily -ecli_df = [] -full_text_df = [] -creator_df = [] -date_decision_df = [] -issued_df = [] -zaaknummer_df = [] -type_df = [] -relations_df = [] -references_df = [] -subject_df = [] -procedure_df = [] -inhoudsindicatie_df = [] -hasVersion_df = [] - -threads = [] -max_workers = 0 - - -def get_cores(): - # max_workers is the number of concurrent processes supported by your CPU multiplied by 5. - # You can change it as per the computing power. - # Different python versions treat this differently. This is written as per python 3.6. - n_cores = multiprocessing.cpu_count() - - global max_workers - max_workers = n_cores-1 - # If the main process is computationally intensive: Set to the number of logical CPU cores minus one. 
- - print(f"Maximum " + str(max_workers) + " threads supported by your machine.") - - -def extract_data_from_xml(url): - with urllib.request.urlopen(url) as response: - xml_file = response.read() - return xml_file - - - -def check_if_df_empty(df): - if df.empty: - return True - return False - - -def get_text_if_exists(el): - try: - return el.text - except: - return '' - -def update_bar(bar, *args): - bar.update(1) - - -def save_data_when_crashed(ecli): - ecli_df.append(ecli) - full_text_df.append("") - creator_df.append("") - date_decision_df.append("") - issued_df.append("") - zaaknummer_df.append("") - type_df.append("") - relations_df.append("") - references_df.append("") - subject_df.append("") - procedure_df.append("") - inhoudsindicatie_df.append("") - hasVersion_df.append("") -def get_data_from_api(ecli_id): - url = RECHTSPRAAK_METADATA_API_BASE_URL + ecli_id + return_type - try: - response_code = check_api(url) - except: - save_data_when_crashed(ecli_id) - return - global ecli_df, full_text_df, creator_df, date_decision_df, issued_df, zaaknummer_df, type_df, \ - relations_df, references_df, subject_df, procedure_df, inhoudsindicatie_df, hasVersion_df - try: - if response_code == 200: - try: - # Extract data from xml - xml_object = extract_data_from_xml(url) - soup = BeautifulSoup(xml_object, features='xml') - # Get the data - creator = get_text_if_exists(soup.find("dcterms:creator")) - date_decision = get_text_if_exists(soup.find("dcterms:date")) - issued = get_text_if_exists(soup.find("dcterms:issued")) - zaaknummer = get_text_if_exists(soup.find("psi:zaaknummer")) - rs_type = get_text_if_exists(soup.find("dcterms:type")) - subject = get_text_if_exists(soup.find("dcterms:subject")) - relation = soup.findAll("dcterms:relation") - relatie = '' - for i in relation: - # append the string to relation - text = get_text_if_exists(i) - if text == '': - continue - else: - relatie += text + "\n" - relations = relatie - reference = soup.findAll("dcterms:references") - ref = '' - for u in reference: - text = get_text_if_exists(u) - # append the string to relation - if text =="": - continue - else: - ref += text + "\n" - references = ref - procedure = get_text_if_exists(soup.find("psi:procedure")) - inhoudsindicatie = get_text_if_exists(soup.find("inhoudsindicatie")) - hasVersion = get_text_if_exists(soup.find("dcterms:hasVersion")) - full_text = get_text_if_exists(soup.find("uitspraak")) - - ecli_df.append(ecli_id) - full_text_df.append(full_text) - creator_df.append(creator) - date_decision_df.append(date_decision) - issued_df.append(issued) - zaaknummer_df.append(zaaknummer) - type_df.append(rs_type) - relations_df.append(relations) - references_df.append(references) - subject_df.append(subject) - procedure_df.append(procedure) - inhoudsindicatie_df.append(inhoudsindicatie) - hasVersion_df.append(hasVersion) - del full_text, creator, date_decision, issued, zaaknummer,relations, rs_type,\ - references, subject,procedure, inhoudsindicatie, hasVersion - - urllib.request.urlcleanup() - - except Exception as e: - save_data_when_crashed(ecli_id) - else: - save_data_when_crashed(ecli_id) - except Exception as e: - save_data_when_crashed(ecli_id) - - -def get_rechtspraak_metadata(save_file='n', dataframe=None, filename=None): - if dataframe is not None and filename is not None: - print(f"Please provide either a dataframe or a filename, but not both") - return False - - if dataframe is None and filename is None and save_file == 'n': - print(f"Please provide at least a dataframe of filename when the 
save_file is \"n\"") - return False - - print("Rechtspraak metadata API") - - start_time = time.time() # Get start time - - no_of_rows = '' - rs_data = '' - csv_files = 0 - - # Check if dataframe is provided and is correct - if dataframe is not None: - if 'id' in dataframe and 'link' in dataframe: - rs_data = dataframe - no_of_rows = rs_data.shape[0] - else: - print("Dataframe is corrupted or does not contain necessary information to get the metadata.") - return False - - # Check if filename is provided and is correct - if filename is not None: - print("Reading " + filename + " from data folder") - file_check = pathlib.Path("data/" + filename) - if file_check.is_file(): - print("File found. Checking if metadata already exists") - # Check if metadata already exists - file_check = Path("data/" + filename.split('/')[-1][:len(filename.split('/')[-1]) - 4] - + "_metadata.csv") - if file_check.is_file(): - print("Metadata for " + filename.split('/')[-1][:len(filename.split('/')[-1]) - 4] + - ".csv already exists.") - return False - else: - rs_data = pd.read_csv('data/' + filename) - if 'id' in rs_data and 'link' in rs_data: - no_of_rows = rs_data.shape[0] - else: - print("File is corrupted or does not contain necessary information to get the metadata.") - return False - else: - print("File not found. Please check the file name.") - return False - - get_cores() # Get number of cores supported by the CPU - - if dataframe is None and filename is None and save_file == 'y': - print("No dataframe or file name is provided. Getting the metadata of all the files present in the " - "data folder") - - print("Reading all CSV files in the data folder...") - csv_files = read_csv('data', "metadata") - - global ecli_df, full_text_df, creator_df, date_decision_df, issued_df, zaaknummer_df, \ - type_df, relations_df,references_df, subject_df,\ - procedure_df, inhoudsindicatie_df, hasVersion_df - if len(csv_files) > 0 and save_file == 'y': - for f in csv_files: - # Create empty dataframe - rsm_df = pd.DataFrame(columns=['ecli', 'full_text', 'creator', 'date_decision', - 'issued', 'zaaknummer','type',"relations", - 'references','subject','procedure', - 'inhoudsindicatie', 'hasVersion']) - - temp_file_name = f.split('\\')[-1][:len(f.split('\\')[-1]) - 4] - - # Check if file already exists - file_check = Path("data/" + temp_file_name + "_metadata.csv") - if file_check.is_file(): - print("Metadata for " + temp_file_name + ".csv already exists.") - continue - - df = pd.read_csv(f) - no_of_rows = df.shape[0] - print("Getting metadata of " + str(no_of_rows) + " ECLIs from " + temp_file_name + ".csv") - print("Working. 
Please wait...") - - # Get all ECLIs in a list - ecli_list = list(df.loc[:, 'id']) - - # Create a temporary directory to save files - time.sleep(1) - Path('temp_rs_data').mkdir(parents=True, exist_ok=True) - with ThreadPoolExecutor(max_workers=max_workers) as executor: - for ecli in ecli_list: - threads.append(executor.submit(get_data_from_api, ecli)) - - # Delete temporary directory - shutil.rmtree('temp_rs_data') - # executor.shutdown() # Shutdown the executor - - rsm_df['ecli'] = ecli_df - rsm_df['full_text'] = full_text_df - rsm_df['creator'] = creator_df - rsm_df['date_decision'] = date_decision_df - rsm_df['issued'] = issued_df - rsm_df['zaaknummer'] = zaaknummer_df - rsm_df['type'] = type_df - rsm_df['relations'] = relations_df - rsm_df['references'] = references_df - rsm_df['subject'] = subject_df - rsm_df['procedure'] = procedure_df - rsm_df['inhoudsindicatie'] = inhoudsindicatie_df - rsm_df['hasVersion'] = hasVersion_df - addition = rs_data[['id', 'summary']] - rsm_df = rsm_df.merge(addition, how='left', left_on='ecli', right_on='id').drop(['id'], axis=1) - # Create directory if not exists - Path('data').mkdir(parents=True, exist_ok=True) - - if check_if_df_empty(rsm_df): - print("Metadata not found. Please check the API response; either API is under maintenance, " - "experiencing problems, or has changed. Please try again after some time or contact the " - "administrator.\n") - else: - # Save CSV file - print("Creating CSV file...") - rsm_df.to_csv("data/" + temp_file_name + "_metadata.csv", index=False, encoding='utf8') - print("CSV file " + temp_file_name + "_metadata.csv successfully created.\n") - - # Clear the lists for the next file - ecli_df = [] - full_text_df = [] - creator_df = [] - date_decision_df = [] - issued_df = [] - zaaknummer_df = [] - type_df = [] - relations_df = [] - references_df = [] - subject_df = [] - procedure_df = [] - inhoudsindicatie_df = [] - hasVersion_df = [] - ecli_list = [] - del rsm_df - return True - - if rs_data is not None: - rsm_df = pd.DataFrame(columns=['ecli', 'full_text', 'creator', 'date_decision', 'issued', - 'zaaknummer','type','relations','references', 'subject', 'procedure', - 'inhoudsindicatie','hasVersion']) - - print("Getting metadata of " + str(no_of_rows) + " ECLIs") - print("Working. Please wait...") - # Get all ECLIs in a list - ecli_list = list(rs_data.loc[:, 'id']) - - # Create a temporary directory to save files - Path('temp_rs_data').mkdir(parents=True, exist_ok=True) - time.sleep(1) - with ThreadPoolExecutor(max_workers=max_workers) as executor: - bar = tqdm(total=len(ecli_list), colour="GREEN",position=0, leave=True, miniters=int(len(ecli_list)/100), - maxinterval=10000) - for ecli in ecli_list: - threads.append(executor.submit(get_data_from_api, ecli)) - for t in threads: - t.add_done_callback(partial(update_bar,bar)) - # Delete temporary directory - shutil.rmtree('temp_rs_data') - # to finish unfinished? 
- # global ecli_df, full_text_df, creator_df, date_decision_df, issued_df, zaaknummer_df, \ - # relations_df, subject_df, procedure_df, inhoudsindicatie_df, hasVersion_df - - rsm_df['ecli'] = ecli_df - rsm_df['full_text'] = full_text_df - rsm_df['creator'] = creator_df - rsm_df['date_decision'] = date_decision_df - rsm_df['issued'] = issued_df - rsm_df['zaaknummer'] = zaaknummer_df - rsm_df['type'] = type_df - rsm_df['relations'] = relations_df - rsm_df['references'] = references_df - rsm_df['subject'] = subject_df - rsm_df['procedure'] = procedure_df - rsm_df['inhoudsindicatie'] = inhoudsindicatie_df - rsm_df['hasVersion'] = hasVersion_df - addition = rs_data[['id','summary']] - rsm_df = rsm_df.merge(addition, how='left', left_on='ecli', right_on='id').drop(['id'], axis=1) - if save_file == 'y': - if filename is None or filename == '': - filename = "custom_rechtspraak_" + datetime.now().strftime("%H-%M-%S") + ".csv" - # Create directory if not exists - Path('data').mkdir(parents=True, exist_ok=True) - - if check_if_df_empty(rsm_df): - print("Metadata not found. Please check the API response; either API is under maintenance, " - "experiencing problems, or has changed. Please try again after some time or contact the " - "administrator.\n") - else: - # Save CSV file - print("Creating CSV file...") - rsm_df.to_csv("data/" + filename.split('/')[-1][:len(filename.split('/')[-1]) - 4] + "_metadata.csv", - index=False, encoding='utf8') - print("CSV file " + filename.split('/')[-1][:len(filename.split('/')[-1]) - 4] + "_metadata.csv" + - " successfully created.\n") - - # Clear the lists for the next file - ecli_df = [] - full_text_df = [] - creator_df = [] - date_decision_df = [] - issued_df = [] - zaaknummer_df = [] - type_df = [] - relations_df = [] - references_df = [] - subject_df = [] - procedure_df = [] - inhoudsindicatie_df = [] - hasVersion_df = [] - ecli_list = [] - - get_exe_time(start_time) - - if save_file == 'n': - return rsm_df - - return True - diff --git a/rechtspraak/rechtspraak_extractor/testing_file.py b/rechtspraak/rechtspraak_extractor/testing_file.py deleted file mode 100644 index 63b7fd6..0000000 --- a/rechtspraak/rechtspraak_extractor/testing_file.py +++ /dev/null @@ -1,6 +0,0 @@ -from rechtspraak import * -from rechtspraak_metadata import * -df = get_rechtspraak(ed='1995-01-01',save_file='n',max_ecli=1000000) -df_2 = get_rechtspraak_metadata(save_file='n',dataframe=df) -b=2 -pass \ No newline at end of file diff --git a/rechtspraak/rechtspraak_extractor/tests/__init__.py b/rechtspraak/rechtspraak_extractor/tests/__init__.py deleted file mode 100644 index 84b56fc..0000000 --- a/rechtspraak/rechtspraak_extractor/tests/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -import sys -import pathlib - -from rechtspraak import get_rechtspraak -from rechtspraak_metadata import get_rechtspraak_metadata - -df = get_rechtspraak(max_ecli=50, sd='2022-08-01', save_file='y') - -# df = get_rechtspraak_metadata(save_file='n') - -print(df.head()) -print(df.shape) \ No newline at end of file diff --git a/rechtspraak/rechtspraak_extractor/tests/rechtspraak.py b/rechtspraak/rechtspraak_extractor/tests/rechtspraak.py deleted file mode 100644 index 3bd72c4..0000000 --- a/rechtspraak/rechtspraak_extractor/tests/rechtspraak.py +++ /dev/null @@ -1,140 +0,0 @@ -# This file is used to get all the Rechtspraak ECLIs from an API. -# It takes two required arguments and one optional argument -# 1. max - Maximum number of ECLIs to retrieve -# 2. starting-date (yyyy-mm-dd) - Start date of ECLI publication -# 3. 
ending-date (yyyy-mm-dd) - It's an optional parameter. If not given, current date will be automatically chosen -# File is stored in data/rechtspraak folder - -import json -import xmltodict -import os -from datetime import date, datetime -from rechtspraak_extractor.rechtspraak_functions import * - - -# Define base URL -RECHTSPRAAK_API_BASE_URL = "https://data.rechtspraak.nl/uitspraken/zoeken?" - -rs_ecli_df = [] -rs_title_df = [] -rs_summary_df = [] -rs_updated_df = [] -rs_link_df = [] - - -def get_data_from_url(url): - res = requests.get(url) - res.raw.decode_content = True - - # Convert the XML data to JSON format - xpars = xmltodict.parse(res.text) - json_string = json.dumps(xpars) - json_object = json.loads(json_string) - - # Get the JSON object from a specific branch - json_object = json_object['feed']['entry'] - - return json_object - - -def save_csv(json_object, file_name, save_file): - # Define the dataframe to enter the data - df = pd.DataFrame(columns=['id', 'title', 'summary', 'updated', 'link']) - ecli_id = [] - title = [] - summary = [] - updated = [] - link = [] - - # Iterate over the object and fill the lists - for i in json_object: - ecli_id.append(i['id']) - title.append(i['title']['#text']) - if '#text' in i['summary']: - summary.append(i['summary']['#text']) - else: - summary.append("No summary available") - updated.append(i['updated']) - link.append(i['link']['@href']) - - # Save the lists to dataframe - df['id'] = ecli_id - df['title'] = title - df['summary'] = summary - df['updated'] = updated - df['link'] = link - - if save_file == 'y': - # Create directory if not exists - Path('data').mkdir(parents=True, exist_ok=True) - - # Save CSV file - # file_path = os.path.join('data', file_name + '.csv') - df.to_csv('data/' + file_name + '.csv', index=False, encoding='utf8') - print("Data saved to CSV file successfully.") - else: - rs_ecli_df.extend(ecli_id) - rs_title_df.extend(title) - rs_summary_df.extend(summary) - rs_updated_df.extend(updated) - rs_link_df.extend(link) - - -def get_rechtspraak(max_ecli=100, sd='2022-08-01', ed=None, save_file='y'): - print("Rechtspraak dump downloader API") - - amount = max_ecli - starting_date = sd - save_file = save_file - - # If the end date is not entered, the current date is taken - today = date.today() - if ed: - ending_date = ed - else: - ending_date = today.strftime("%Y-%m-%d") - - # Used to calculate total execution time - start_time = time.time() - - # Build the URL after getting all the arguments - url = RECHTSPRAAK_API_BASE_URL + 'max=' + str(amount) + '&date=' + starting_date + '&date=' + ending_date - - print("Checking the API") - # Check the working of API - response_code = check_api(url) - if response_code == 200: - print("API is working fine!") - print("Getting " + str(amount) + " documents from " + starting_date + " till " + ending_date) - - json_object = get_data_from_url(url) - - if json_object: - # Get current time - current_time = datetime.now().strftime("%H-%M-%S") - - # Build file name - file_name = 'rechtspraak_' + starting_date + '_' + ending_date + '_' + current_time - - save_csv(json_object, file_name, save_file) - get_exe_time(start_time) - - if save_file == 'n': - global rs_ecli_df, rs_title_df, rs_summary_df, rs_updated_df, rs_link_df - global_rs_df = pd.DataFrame(columns=['id', 'title', 'summary', 'updated', 'link']) - global_rs_df['id'] = rs_ecli_df - global_rs_df['title'] = rs_title_df - global_rs_df['summary'] = rs_summary_df - global_rs_df['updated'] = rs_updated_df - global_rs_df['link'] = rs_link_df - 
print("Done") - - # Clear the lists for the next usage - rs_ecli_df = [] - rs_title_df = [] - rs_summary_df = [] - rs_updated_df = [] - rs_link_df = [] - return global_rs_df - else: - print(f"URL returned with a {response_code} error code") diff --git a/rechtspraak/rechtspraak_extractor/tests/rechtspraak_functions.py b/rechtspraak/rechtspraak_extractor/tests/rechtspraak_functions.py deleted file mode 100644 index aa84043..0000000 --- a/rechtspraak/rechtspraak_extractor/tests/rechtspraak_functions.py +++ /dev/null @@ -1,41 +0,0 @@ -import requests, glob, time -from pathlib import Path -import pandas as pd - - -# Check whether the API is working or not and return with the response code -def check_api(url): - response = requests.get(f"{url}") - - # Return with the response code - return response.status_code - - -# Reads all the CSV files in a folder and returns the list of files -# It also has an optional parameter "exclude". By default, it's None. If you want to exclude files having a certain -# word in the file name, you may give a value -def read_csv(dir_name, exclude=None): - path = dir_name - csv_files = glob.glob(path + "/*.csv") - files = [] - for i in csv_files: - if exclude is not None: - if exclude not in i: - files.append(i) - else: - files.append(i) - - print("Found " + str(len(files)) + " CSV file(s)\n") - return files - - -# Get total execution time -def get_exe_time(start_time): - end_time = time.time() - sec = end_time - start_time - mins = sec // 60 - sec = sec % 60 - hours = mins // 60 - mins = mins % 60 - print("Total execution time: {0}:{1}:{2}".format(int(hours), int(mins), round(sec, 2))) - print("\n") diff --git a/rechtspraak/rechtspraak_extractor/tests/rechtspraak_metadata.py b/rechtspraak/rechtspraak_extractor/tests/rechtspraak_metadata.py deleted file mode 100644 index 2cf9cbd..0000000 --- a/rechtspraak/rechtspraak_extractor/tests/rechtspraak_metadata.py +++ /dev/null @@ -1,319 +0,0 @@ -# This file is used for getting the metadata of the ECLIs obtained using rechspraak_api file. This file takes all the -# CSV file created by rechspraak_api, picks up ECLIs and links column, and using an API gets the metadata and saves it -# in another CSV file with metadata suffix. -# This happens in async manner. -import pathlib -import os -import urllib -import multiprocessing -from bs4 import BeautifulSoup -from datetime import datetime -from concurrent.futures import ThreadPoolExecutor -import platform -import shutil - -from rechtspraak_extractor.rechtspraak_functions import * - -# Define base url -RECHTSPRAAK_METADATA_API_BASE_URL = "https://uitspraken.rechtspraak.nl/InzienDocument?id=" - -# Define empty lists where we'll store our data temporarily -ecli_df = [] -uitspraak_df = [] -instantie_df = [] -datum_uitspraak_df = [] -datum_publicatie_df = [] -zaaknummer_df = [] -rechtsgebieden_df = [] -bijzondere_kenmerken_df = [] -inhoudsindicatie_df = [] -vindplaatsen_df = [] - -threads = [] -max_workers = 0 - - -def get_cores(): - # max_workers is the number of concurrent processes supported by your CPU multiplied by 5. - # You can change it as per the computing power. - # Different python versions treat this differently. This is written as per python 3.6. - n_cores = multiprocessing.cpu_count() - - global max_workers - max_workers = n_cores * 5 - # If the main process is computationally intensive: Set to the number of logical CPU cores minus one. 
- - print(f"Maximum " + str(max_workers) + " threads supported by your machine.") - - -def extract_data_from_html(filename): - soup = BeautifulSoup(open("temp_rs_data/" + filename), "html.parser") - return soup - - -def get_data_from_api(ecli_id): - url = RECHTSPRAAK_METADATA_API_BASE_URL + ecli_id - response_code = check_api(url) - global ecli_df, uitspraak_df, instantie_df, datum_uitspraak_df, datum_publicatie_df, zaaknummer_df, \ - rechtsgebieden_df, bijzondere_kenmerken_df, inhoudsindicatie_df, vindplaatsen_df - try: - if response_code == 200: - try: - # Create HTML file - # html_file = ecli_id + ".html" - html_file = ecli_id.replace(":", "-") + ".html" - urllib.request.urlretrieve(url, "temp_rs_data/" + html_file) - - # Extract data from HTML - html_object = extract_data_from_html(html_file) - - soup = BeautifulSoup(str(html_object), features='lxml') - - # Get the data - uitspraak_info = soup.find_all("div", {"class": "uitspraak-info"}) - section = soup.find_all("div", {"class": "section"}) - - # We're using temporary variable "temp" to get the other metadata information such as instantie, - # datum uitspraak, datum publicatie, zaaknummer, rechtsgebieden, bijzondere kenmerken, - # inhoudsindicatie, and vindplaatsen - temp = soup.find_all("dl", {"class": "dl-horizontal"}) - instantie = BeautifulSoup(str(temp[0]('dd')[0]), features='lxml').get_text().strip() - datum_uitspraak = BeautifulSoup(str(temp[0]('dd')[1]), features='lxml').get_text().strip() - datum_publicatie = BeautifulSoup(str(temp[0]('dd')[2]), features='lxml').get_text().strip() - zaaknummer = BeautifulSoup(str(temp[0]('dd')[3]), features='lxml').get_text().strip() - rechtsgebieden = BeautifulSoup(str(temp[0]('dd')[4]), features='lxml').get_text().strip() - bijzondere_kenmerken = BeautifulSoup(str(temp[0]('dd')[5]), features='lxml').get_text().strip() - inhoudsindicatie = BeautifulSoup(str(temp[0]('dd')[6]), features='lxml').get_text().strip() - vindplaatsen = BeautifulSoup(str(temp[0]('dd')[7]), features='lxml').get_text().strip() - - uitspraak = BeautifulSoup(str(uitspraak_info), features='lxml').get_text() - uitspraak = uitspraak + BeautifulSoup(str(section), features='lxml').get_text() - - ecli_df.append(ecli_id) - uitspraak_df.append(uitspraak) - instantie_df.append(instantie) - datum_uitspraak_df.append(datum_uitspraak) - datum_publicatie_df.append(datum_publicatie) - zaaknummer_df.append(zaaknummer) - rechtsgebieden_df.append(rechtsgebieden) - bijzondere_kenmerken_df.append(bijzondere_kenmerken) - inhoudsindicatie_df.append(inhoudsindicatie) - vindplaatsen_df.append(vindplaatsen) - - del uitspraak, instantie, datum_uitspraak, datum_publicatie, zaaknummer, rechtsgebieden, \ - bijzondere_kenmerken, inhoudsindicatie, vindplaatsen - - # BS4 creates an HTML file to get the data. 
Remove the file after use - if os.path.exists("temp_rs_data/" + html_file): - os.remove("temp_rs_data/" + html_file) - urllib.request.urlcleanup() - - except urllib.error.URLError as e: - print(e) - except urllib.error.HTTPError as e: - print(e) - except Exception as e: - print(e) - else: - ecli_df.append(ecli_id) - uitspraak_df.append("API returned with error code: " + str(response_code)) - except requests.exceptions.RequestException as e: - raise SystemExit(e) - - -def get_rechtspraak_metadata(save_file='n', dataframe=None, filename=None): - if dataframe is not None and filename is not None: - print(f"Please provide either a dataframe or a filename, but not both") - return False - - if dataframe is None and filename is None and save_file == 'n': - print(f"Please provide at least a dataframe of filename when the save_file is \"n\"") - return False - - print("Rechtspraak metadata API") - - start_time = time.time() # Get start time - - no_of_rows = '' - rs_data = '' - csv_files = 0 - - # Check if dataframe is provided and is correct - if dataframe is not None: - if 'id' in dataframe and 'link' in dataframe: - rs_data = dataframe - no_of_rows = rs_data.shape[0] - else: - print("Dataframe is corrupted or does not contain necessary information to get the metadata.") - return False - - # Check if filename is provided and is correct - if filename is not None: - print("Reading " + filename + " from data folder") - file_check = pathlib.Path("data/" + filename) - if file_check.is_file(): - print("File found. Checking if metadata already exists") - # Check if metadata already exists - file_check = Path("data/" + filename.split('/')[-1][:len(filename.split('/')[-1]) - 4] - + "_metadata.csv") - if file_check.is_file(): - print("Metadata for " + filename.split('/')[-1][:len(filename.split('/')[-1]) - 4] + - ".csv already exists.") - return False - else: - rs_data = pd.read_csv('data/' + filename) - if 'id' in rs_data and 'link' in rs_data: - no_of_rows = rs_data.shape[0] - else: - print("File is corrupted or does not contain necessary information to get the metadata.") - return False - else: - print("File not found. Please check the file name.") - return False - - get_cores() # Get number of cores supported by the CPU - - if dataframe is None and filename is None and save_file == 'y': - print("No dataframe or file name is provided. Getting the metadata of all the files present in the " - "data folder") - - print("Reading all CSV files in the data folder...") - csv_files = read_csv('data', "metadata") - - global ecli_df, uitspraak_df, instantie_df, datum_uitspraak_df, datum_publicatie_df, zaaknummer_df, \ - rechtsgebieden_df, bijzondere_kenmerken_df, inhoudsindicatie_df, vindplaatsen_df - - if len(csv_files) > 0 and save_file == 'y': - for f in csv_files: - # Create empty dataframe - rsm_df = pd.DataFrame(columns=['ecli_id', 'uitspraak', 'instantie', 'datum_uitspraak', - 'datum_publicatie', 'zaaknummer', 'rechtsgebieden', - 'bijzondere_kenmerken', 'inhoudsindicatie', 'vindplaatsen']) - - # Check if file already exists - file_check = Path("data/" + f.split('\\')[-1][:len(f.split('\\')[-1]) - 4] + "_metadata.csv") - if file_check.is_file(): - print("Metadata for " + f.split('\\')[-1][:len(f.split('\\')[-1]) - 4] + ".csv already exists.") - continue - - df = pd.read_csv(f) - no_of_rows = df.shape[0] - print("Getting metadata of " + str(no_of_rows) + " ECLIs from " + - f.split('/')[-1][:len(f.split('/')[-1]) - 4] + ".csv") - print("Working. 
Please wait...") - - # Get all ECLIs in a list - ecli_list = list(df.loc[:, 'id']) - - # Create a temporary directory to save files - Path('temp_rs_data').mkdir(parents=True, exist_ok=True) - with ThreadPoolExecutor(max_workers=max_workers) as executor: - for ecli in ecli_list: - threads.append(executor.submit(get_data_from_api, ecli)) - - # Delete temporary directory - shutil.rmtree('temp_rs_data') - # executor.shutdown() # Shutdown the executor - - # Save CSV file - print("Creating CSV file...") - - rsm_df['ecli_id'] = ecli_df - rsm_df['uitspraak'] = uitspraak_df - rsm_df['instantie'] = instantie_df - rsm_df['datum_uitspraak'] = datum_uitspraak_df - rsm_df['datum_publicatie'] = datum_publicatie_df - rsm_df['zaaknummer'] = zaaknummer_df - rsm_df['rechtsgebieden'] = rechtsgebieden_df - rsm_df['bijzondere_kenmerken'] = bijzondere_kenmerken_df - rsm_df['inhoudsindicatie'] = inhoudsindicatie_df - rsm_df['vindplaatsen'] = vindplaatsen_df - - # Create directory if not exists - Path('data').mkdir(parents=True, exist_ok=True) - - rsm_df.to_csv("data/" + f.split('\\')[-1][:len(f.split('\\')[-1]) - 4] + "_metadata.csv", - index=False, encoding='utf8') - print("CSV file " + f.split('\\')[-1][:len(f.split('\\')[-1]) - 4] + "_metadata.csv" + - " successfully created.\n") - - # Clear the lists for the next file - ecli_df = [] - uitspraak_df = [] - instantie_df = [] - datum_uitspraak_df = [] - datum_publicatie_df = [] - zaaknummer_df = [] - rechtsgebieden_df = [] - bijzondere_kenmerken_df = [] - inhoudsindicatie_df = [] - vindplaatsen_df = [] - ecli_list = [] - del rsm_df - return True - - if rs_data is not None: - rsm_df = pd.DataFrame(columns=['ecli_id', 'uitspraak', 'instantie', 'datum_uitspraak', 'datum_publicatie', - 'zaaknummer', 'rechtsgebieden', 'bijzondere_kenmerken', 'inhoudsindicatie', - 'vindplaatsen']) - - print("Getting metadata of " + str(no_of_rows) + " ECLIs") - print("Working. 
Please wait...") - # Get all ECLIs in a list - ecli_list = list(rs_data.loc[:, 'id']) - - # Create a temporary directory to save files - Path('temp_rs_data').mkdir(parents=True, exist_ok=True) - - with ThreadPoolExecutor(max_workers=max_workers) as executor: - for ecli in ecli_list: - threads.append(executor.submit(get_data_from_api, ecli)) - - # Delete temporary directory - shutil.rmtree('temp_rs_data') - - # global ecli_df, uitspraak_df, instantie_df, datum_uitspraak_df, datum_publicatie_df, zaaknummer_df, \ - # rechtsgebieden_df, bijzondere_kenmerken_df, inhoudsindicatie_df, vindplaatsen_df - - rsm_df['ecli_id'] = ecli_df - rsm_df['uitspraak'] = uitspraak_df - rsm_df['instantie'] = instantie_df - rsm_df['datum_uitspraak'] = datum_uitspraak_df - rsm_df['datum_publicatie'] = datum_publicatie_df - rsm_df['zaaknummer'] = zaaknummer_df - rsm_df['rechtsgebieden'] = rechtsgebieden_df - rsm_df['bijzondere_kenmerken'] = bijzondere_kenmerken_df - rsm_df['inhoudsindicatie'] = inhoudsindicatie_df - rsm_df['vindplaatsen'] = vindplaatsen_df - - if save_file == 'y': - if filename is None or filename == '': - filename = "custom_rechtspraak_" + datetime.now().strftime("%H-%M-%S") + ".csv" - # Create directory if not exists - Path('data').mkdir(parents=True, exist_ok=True) - - rsm_df.to_csv("data/" + filename.split('/')[-1][:len(filename.split('/')[-1]) - 4] + "_metadata.csv", - index=False, encoding='utf8') - print("CSV file " + filename.split('/')[-1][:len(filename.split('/')[-1]) - 4] + "_metadata.csv" + - " successfully created.\n") - - # Clear the lists for the next file - ecli_df = [] - uitspraak_df = [] - instantie_df = [] - datum_uitspraak_df = [] - datum_publicatie_df = [] - zaaknummer_df = [] - rechtsgebieden_df = [] - bijzondere_kenmerken_df = [] - inhoudsindicatie_df = [] - vindplaatsen_df = [] - ecli_list = [] - - get_exe_time(start_time) - - if save_file == 'n': - return rsm_df - - return True - diff --git a/rechtspraak/setup.py b/rechtspraak/setup.py deleted file mode 100644 index add6313..0000000 --- a/rechtspraak/setup.py +++ /dev/null @@ -1,25 +0,0 @@ -# This file is required to create a python library - -from setuptools import find_packages, setup -from pathlib import Path - -p = Path("README.md") -long_descr = p.read_text() - -setup( - name='rechtspraak_extractor', - packages=find_packages(include=['rechtspraak_extractor']), - version='1.1.17', - description='Library for extracting rechtspraak data', - author='LawTech Lab', - license='MIT', - install_requires=['bs4', 'lxml==4.6.3', 'requests==2.26.0', 'xmltodict==0.13.0', 'python_dotenv==0.15.0', 'pandas','tqdm'], - author_email='pranav.bapat@student.maastrichtuniversity.nl', - keywords=['rechtspraak', 'extractor', 'rechtspraak extractor'], - long_description=long_descr, - long_description_content_type='text/markdown', - project_urls={ - "Bug Tracker": "https://github.com/maastrichtlawtech/extraction_libraries", - "Build Source": "https://github.com/maastrichtlawtech/extraction_libraries", - }, -) \ No newline at end of file diff --git a/tests.py b/tests.py index adcb207..4732cdb 100644 --- a/tests.py +++ b/tests.py @@ -1,26 +1,4 @@ from cellar_extractor import * -from echr_extractor import * -from rechtspraak_extractor import * - -def echr_y(): - get_echr(save_file='y',count=100,start_date='2022-01-01') - -def echr_n(): - get_echr(save_file='n',count=100,start_date='2022-01-01') - -def echr_extra_y(): - get_echr_extra(save_file='n',count=100,start_date='2022-01-01') - -def echr_extra_n(): - 
get_echr_extra(save_file='n',count=100,start_date='2022-01-01') - -def rechtspraak_n(): - df = get_rechtspraak(max_ecli=100,sd='2022-01-01',save_file='n') - get_rechtspraak_metadata(save_file='n',dataframe=df) - -def rechtspraak_y(): - df = get_rechtspraak(max_ecli=100,sd='2022-01-01',save_file='y') - get_rechtspraak_metadata(save_file='y',dataframe=df) def cellar_csv_n(): get_cellar(save_file='n', file_format='csv', sd='2022-01-01', max_ecli=100) @@ -92,46 +70,3 @@ def test_cellar_json_n(): assert True except Exception: assert False, "Downloading cellar as json failed." - - -def test_echr_extra_y(): - try: - echr_extra_y() - assert True - except Exception: - assert False, "Saving extra echr failed" - -def test_echr_extra_n(): - try: - echr_extra_n() - assert True - except Exception: - assert False, "Downloading extra echr failed" - -def test_echr_y(): - try: - echr_y() - assert True - except Exception: - assert False, "Saving echr failed" - -def test_echr_n(): - try: - echr_n() - assert True - except Exception: - assert False, "Downloading echr failed" - -def test_rechtspraak_y(): - try: - rechtspraak_y() - assert True - except Exception: - assert False, "Saving extra rechtspraak failed" - -def test_rechtspraak_n(): - try: - rechtspraak_n() - assert True - except Exception: - assert False, "Downloading extra rechtspraak failed" From 6231dd6bfb060b4c832ba9e41959297cd3fcd2a0 Mon Sep 17 00:00:00 2001 From: Piotr Lewandowski Date: Sat, 15 Jul 2023 15:14:18 +0200 Subject: [PATCH 02/11] fixing cellar pandas error --- cellar/cellar_extractor/fulltext_saving.py | 2 +- cellar/setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cellar/cellar_extractor/fulltext_saving.py b/cellar/cellar_extractor/fulltext_saving.py index 44af01c..c4895db 100644 --- a/cellar/cellar_extractor/fulltext_saving.py +++ b/cellar/cellar_extractor/fulltext_saving.py @@ -173,6 +173,6 @@ def add_sections(data, threads, json_filepath=None): def add_column_frow_list(data, name, list): column = pd.Series([], dtype='string') for l in list: - column = column.append(l) + column = pd.concart(column,l) column.sort_index(inplace=True) data.insert(1, name, column) diff --git a/cellar/setup.py b/cellar/setup.py index 4a4a614..cc3037e 100644 --- a/cellar/setup.py +++ b/cellar/setup.py @@ -10,7 +10,7 @@ setup( name='cellar_extractor', packages=find_packages(include=['cellar_extractor']), - version='1.0.50', + version='1.0.51', description='Library for extracting cellar data', author='LawTech Lab', license='MIT', From 808fd595177c40bbc0ceb1fb07e156fc7d8f8036 Mon Sep 17 00:00:00 2001 From: Piotr Lewandowski Date: Sat, 15 Jul 2023 15:20:17 +0200 Subject: [PATCH 03/11] fixing cellar pandas error #2 --- cellar/cellar_extractor/fulltext_saving.py | 2 +- cellar/setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cellar/cellar_extractor/fulltext_saving.py b/cellar/cellar_extractor/fulltext_saving.py index c4895db..49fe4d1 100644 --- a/cellar/cellar_extractor/fulltext_saving.py +++ b/cellar/cellar_extractor/fulltext_saving.py @@ -173,6 +173,6 @@ def add_sections(data, threads, json_filepath=None): def add_column_frow_list(data, name, list): column = pd.Series([], dtype='string') for l in list: - column = pd.concart(column,l) + column = pd.concat(column,l) column.sort_index(inplace=True) data.insert(1, name, column) diff --git a/cellar/setup.py b/cellar/setup.py index cc3037e..7394c42 100644 --- a/cellar/setup.py +++ b/cellar/setup.py @@ -10,7 +10,7 @@ setup( name='cellar_extractor', 
packages=find_packages(include=['cellar_extractor']), - version='1.0.51', + version='1.0.52', description='Library for extracting cellar data', author='LawTech Lab', license='MIT', From b4071f0603fbeae6e58a170ac1b0c7dec354d9b1 Mon Sep 17 00:00:00 2001 From: Piotr Lewandowski Date: Sat, 15 Jul 2023 15:25:46 +0200 Subject: [PATCH 04/11] fixing cellar pandas error #3 --- cellar/cellar_extractor/fulltext_saving.py | 2 +- cellar/setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cellar/cellar_extractor/fulltext_saving.py b/cellar/cellar_extractor/fulltext_saving.py index 49fe4d1..8bb03c3 100644 --- a/cellar/cellar_extractor/fulltext_saving.py +++ b/cellar/cellar_extractor/fulltext_saving.py @@ -173,6 +173,6 @@ def add_sections(data, threads, json_filepath=None): def add_column_frow_list(data, name, list): column = pd.Series([], dtype='string') for l in list: - column = pd.concat(column,l) + column = pd.concat([column,l]) column.sort_index(inplace=True) data.insert(1, name, column) diff --git a/cellar/setup.py b/cellar/setup.py index 7394c42..5473f4f 100644 --- a/cellar/setup.py +++ b/cellar/setup.py @@ -10,7 +10,7 @@ setup( name='cellar_extractor', packages=find_packages(include=['cellar_extractor']), - version='1.0.52', + version='1.0.53', description='Library for extracting cellar data', author='LawTech Lab', license='MIT', From 7d433121f9009ccd7d49e5b5c9cf2cfa67bbc7e8 Mon Sep 17 00:00:00 2001 From: Piotr Lewandowski Date: Thu, 3 Aug 2023 11:58:41 +0200 Subject: [PATCH 05/11] filtering by subject matter method added --- cellar/README.md | 9 ++++++ cellar/cellar_extractor/Testing_file.py | 3 +- cellar/cellar_extractor/__init__.py | 3 +- cellar/cellar_extractor/cellar.py | 36 ++++++++++++++++-------- cellar/cellar_extractor/csv_extractor.py | 1 + cellar/setup.py | 2 +- 6 files changed, 40 insertions(+), 14 deletions(-) diff --git a/cellar/README.md b/cellar/README.md index c870daa..6447a9d 100644 --- a/cellar/README.md +++ b/cellar/README.md @@ -57,6 +57,8 @@ Python 3.9
  • get_nodes_and_edges_lists
  • 
 Gets two list objects, one for the nodes and one for the edges of the citation network within the passed dataframe.
 Allows the creation of a network graph of the citations. Can only be returned in-memory.
 +
  • filter_subject_matter
  • +
 Returns a dataframe containing only those cases whose subject matter column contains a given phrase.
    @@ -103,6 +105,13 @@ Python 3.9 DataFrame of cellar metadata acquired from the get_cellar_extra method with eurlex webservice credentials passed. This method will only work on dataframes with citations data. +
  • filter_subject_matter
  • +
      +
    • df: DataFrame object, required, default None
    • + DataFrame of cellar metadata acquired from any of the cellar extraction methods listed above. +
    • phrase: string, required, default None
    • +
 The phrase that must be present in the subject matter of a case; matching is case-insensitive. +
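A minimal in-memory usage sketch of the new filter_subject_matter helper, mirroring the call shown in Testing_file.py above (a sketch, not part of the patch itself; it assumes the cellar-extractor package is installed and the Cellar endpoint is reachable):
```
from cellar_extractor import get_cellar, filter_subject_matter

# Download a small batch of cellar metadata in-memory (no file saved).
df = get_cellar(sd='2023-01-01', max_ecli=100, save_file='n')

# Keep only the cases whose subject matter mentions "prices" (case-insensitive).
filtered = filter_subject_matter(df, "prices")
print(filtered.shape)
```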
    diff --git a/cellar/cellar_extractor/Testing_file.py b/cellar/cellar_extractor/Testing_file.py index d2d81b3..8204c52 100644 --- a/cellar/cellar_extractor/Testing_file.py +++ b/cellar/cellar_extractor/Testing_file.py @@ -25,5 +25,6 @@ cits = get_citations_with_extra_info(text) print(cits) data,d2 = get_cellar_extra(sd='2023-01-01',max_ecli=100,save_file='n') - nodes_edges = get_nodes_and_edges_lists(data) + d3 = filter_subject_matter(data, "prices") + b=2 pass \ No newline at end of file diff --git a/cellar/cellar_extractor/__init__.py b/cellar/cellar_extractor/__init__.py index 3f96902..7821b0c 100644 --- a/cellar/cellar_extractor/__init__.py +++ b/cellar/cellar_extractor/__init__.py @@ -1,3 +1,4 @@ from cellar_extractor.cellar import get_cellar from cellar_extractor.cellar import get_cellar_extra -from cellar_extractor.cellar import get_nodes_and_edges_lists \ No newline at end of file +from cellar_extractor.cellar import get_nodes_and_edges_lists +from cellar_extractor.cellar import filter_subject_matter diff --git a/cellar/cellar_extractor/cellar.py b/cellar/cellar_extractor/cellar.py index 24c4a67..4dac929 100644 --- a/cellar/cellar_extractor/cellar.py +++ b/cellar/cellar_extractor/cellar.py @@ -1,14 +1,16 @@ import json import os -from os.path import join +import time from datetime import datetime from pathlib import Path + from tqdm import tqdm + +from cellar_extractor.cellar_extra_extract import extra_cellar from cellar_extractor.cellar_queries import get_all_eclis, get_raw_cellar_metadata from cellar_extractor.json_to_csv import json_to_csv_main, json_to_csv_returning -from cellar_extractor.cellar_extra_extract import extra_cellar from cellar_extractor.nodes_and_edges import get_nodes_and_edges -import time + def get_cellar(ed=None, save_file='y', max_ecli=100, sd="2022-05-01", file_format='csv'): if not ed: @@ -28,7 +30,7 @@ def get_cellar(ed=None, save_file='y', max_ecli=100, sd="2022-05-01", file_forma return False all_eclis = {} concurrent_docs = 100 - for i in tqdm(range(0, len(eclis), concurrent_docs),colour="GREEN"): + for i in tqdm(range(0, len(eclis), concurrent_docs), colour="GREEN"): new_eclis = get_raw_cellar_metadata(eclis[i:(i + concurrent_docs)]) all_eclis = {**all_eclis, **new_eclis} if save_file == 'y': @@ -62,23 +64,35 @@ def get_cellar_extra(ed=None, save_file='y', max_ecli=100, sd="2022-05-01", thre file_path = os.path.join('data', file_name + '.csv') if save_file == 'y': Path('data').mkdir(parents=True, exist_ok=True) - extra_cellar(data = data ,filepath=file_path, threads=threads, username=username, password=password) + extra_cellar(data=data, filepath=file_path, threads=threads, username=username, password=password) print("\n--- DONE ---") else: - data,json = extra_cellar(data= data, threads = threads, username= username,password=password) + data, json = extra_cellar(data=data, threads=threads, username=username, password=password) print("\n--- DONE ---") - return data,json + return data, json -def get_nodes_and_edges_lists(df = None): + +def get_nodes_and_edges_lists(df=None): if df is None: print("No dataframe passed!") return else: try: - nodes,edges = get_nodes_and_edges(df) + nodes, edges = get_nodes_and_edges(df) except: print('Something went wrong. Nodes and edges creation unsuccessful.') - return False,False - return nodes,edges + return False, False + return nodes, edges + + +def filter_subject_matter(df=None, phrase=None): + if df is None or phrase is None: + print("Incorrect input values! \n Returning... 
\n") + else: + try: + mask = df["LEGAL RESOURCE IS ABOUT SUBJECT MATTER"].str.lower().str.contains(phrase) + return df[mask] + except: + print("Something went wrong!\n Returning... \n") diff --git a/cellar/cellar_extractor/csv_extractor.py b/cellar/cellar_extractor/csv_extractor.py index 23ee71c..b623aac 100644 --- a/cellar/cellar_extractor/csv_extractor.py +++ b/cellar/cellar_extractor/csv_extractor.py @@ -24,6 +24,7 @@ def extract_rows(data, number): print("") print("EXTRACTION FROM CSV FILE IN DATA PROCESSED DIR STARTED") print("") + DIR_DATA_RAW='' csv_files = (glob.glob(DIR_DATA_RAW + "/" + "*.csv")) print(f"FOUND {len(csv_files)} CSV FILES") diff --git a/cellar/setup.py b/cellar/setup.py index 5473f4f..2ad4544 100644 --- a/cellar/setup.py +++ b/cellar/setup.py @@ -10,7 +10,7 @@ setup( name='cellar_extractor', packages=find_packages(include=['cellar_extractor']), - version='1.0.53', + version='1.0.54', description='Library for extracting cellar data', author='LawTech Lab', license='MIT', From ec3829eaac04652873583a30714c9f7729b967d8 Mon Sep 17 00:00:00 2001 From: Piotr Lewandowski Date: Fri, 4 Aug 2023 15:41:19 +0200 Subject: [PATCH 06/11] new nodes edges functionality --- cellar/README.md | 3 +++ cellar/cellar_extractor/cellar.py | 15 ++++++------- cellar/cellar_extractor/nodes_and_edges.py | 26 +++++++++++++--------- cellar/setup.py | 2 +- 4 files changed, 26 insertions(+), 20 deletions(-) diff --git a/cellar/README.md b/cellar/README.md index 6447a9d..acb6b80 100644 --- a/cellar/README.md +++ b/cellar/README.md @@ -104,6 +104,9 @@ Python 3.9
  • df: DataFrame object, required, default None
  • DataFrame of cellar metadata acquired from the get_cellar_extra method with eurlex webservice credentials passed. This method will only work on dataframes with citations data. +
  • only_local: boolean, optional, default False
  • +
 Flag for nodes and edges generation. If set to True, the resulting network only includes nodes and edges between
 + cases that are themselves present in the given dataframe.
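A minimal sketch of the only_local flag (illustrative only; the username and password values are placeholders for eurlex webservice credentials, which are needed to obtain the citations data this method relies on):
```
from cellar_extractor import get_cellar_extra, get_nodes_and_edges_lists

# Citation data requires the extra extraction with eurlex webservice credentials.
df, json_data = get_cellar_extra(sd='2023-01-01', max_ecli=100, save_file='n',
                                 username='your_username', password='your_password')

# Full network: edges may point to cited cases outside the dataframe.
nodes, edges = get_nodes_and_edges_lists(df)

# Local network: only edges between cases that appear in the dataframe itself.
local_nodes, local_edges = get_nodes_and_edges_lists(df, only_local=True)
```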
  • filter_subject_matter
    • diff --git a/cellar/cellar_extractor/cellar.py b/cellar/cellar_extractor/cellar.py index 4dac929..fb197cd 100644 --- a/cellar/cellar_extractor/cellar.py +++ b/cellar/cellar_extractor/cellar.py @@ -74,17 +74,16 @@ def get_cellar_extra(ed=None, save_file='y', max_ecli=100, sd="2022-05-01", thre return data, json -def get_nodes_and_edges_lists(df=None): +def get_nodes_and_edges_lists(df=None, only_local=False): if df is None: print("No dataframe passed!") return - else: - try: - nodes, edges = get_nodes_and_edges(df) - except: - print('Something went wrong. Nodes and edges creation unsuccessful.') - return False, False - return nodes, edges + try: + nodes, edges = get_nodes_and_edges(df,only_local) + except: + print('Something went wrong. Nodes and edges creation unsuccessful.') + return False, False + return nodes, edges def filter_subject_matter(df=None, phrase=None): diff --git a/cellar/cellar_extractor/nodes_and_edges.py b/cellar/cellar_extractor/nodes_and_edges.py index 9578c36..13adf6b 100644 --- a/cellar/cellar_extractor/nodes_and_edges.py +++ b/cellar/cellar_extractor/nodes_and_edges.py @@ -5,7 +5,7 @@ def extract_containing_subject_matter(df,phrase): def get_df_with_celexes(df,celexes): returner = df[df['CELEX IDENTIFIER'].isin(celexes)] return returner -def get_edges_list(df): +def get_edges_list(df,only_local): extraction = df[['CELEX IDENTIFIER','citing']] extraction.reset_index(inplace=True) keys = extraction['CELEX IDENTIFIER'] @@ -15,16 +15,20 @@ def get_edges_list(df): for i in range(len(keys)): k = keys[i] val = vals[i] - if val == val: - nodes.add(str(k)) - val_unpacked = val.split(";") - for val in val_unpacked: - nodes.add(str(val)) - edges.append(str(k)+','+str(val)) - else: - pass + if val != val: + continue + nodes.add(str(k)) + val_unpacked = val.split(";") + for val in val_unpacked: + if only_local and val not in keys: + continue + nodes.add(str(val)) + edges.append(str(k)+','+str(val)) + + nodes = list(nodes) + return edges, list(nodes) -def get_nodes_and_edges(df): - edges, nodes = get_edges_list(df) +def get_nodes_and_edges(df,only_local): + edges, nodes = get_edges_list(df,only_local) #nodes = get_df_with_celexes(df,celexes) return nodes,edges \ No newline at end of file diff --git a/cellar/setup.py b/cellar/setup.py index 2ad4544..0507f5f 100644 --- a/cellar/setup.py +++ b/cellar/setup.py @@ -10,7 +10,7 @@ setup( name='cellar_extractor', packages=find_packages(include=['cellar_extractor']), - version='1.0.54', + version='1.0.55', description='Library for extracting cellar data', author='LawTech Lab', license='MIT', From cbc244615b0f8af6cf3f0c62f0e924f88c17e934 Mon Sep 17 00:00:00 2001 From: Piotr Lewandowski Date: Sat, 5 Aug 2023 18:11:58 +0200 Subject: [PATCH 07/11] small fix --- cellar/cellar_extractor/cellar.py | 6 ++++-- cellar/setup.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/cellar/cellar_extractor/cellar.py b/cellar/cellar_extractor/cellar.py index fb197cd..820acc8 100644 --- a/cellar/cellar_extractor/cellar.py +++ b/cellar/cellar_extractor/cellar.py @@ -91,7 +91,9 @@ def filter_subject_matter(df=None, phrase=None): print("Incorrect input values! \n Returning... \n") else: try: - mask = df["LEGAL RESOURCE IS ABOUT SUBJECT MATTER"].str.lower().str.contains(phrase) + mask = df["LEGAL RESOURCE IS ABOUT SUBJECT MATTER"].str.lower().str.contains(phrase,na=False) return df[mask] - except: + except Exception as e: + print(e) print("Something went wrong!\n Returning... 
\n") + return None diff --git a/cellar/setup.py b/cellar/setup.py index 0507f5f..66cd9bc 100644 --- a/cellar/setup.py +++ b/cellar/setup.py @@ -10,7 +10,7 @@ setup( name='cellar_extractor', packages=find_packages(include=['cellar_extractor']), - version='1.0.55', + version='1.0.57', description='Library for extracting cellar data', author='LawTech Lab', license='MIT', From b007556ba25c1f023bfd0a331085171c9b967ea9 Mon Sep 17 00:00:00 2001 From: Piotr Lewandowski Date: Sat, 12 Aug 2023 14:57:42 +0200 Subject: [PATCH 08/11] small fix --- cellar/cellar_extractor/Testing_file.py | 2 +- cellar/cellar_extractor/cellar.py | 2 +- cellar/setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cellar/cellar_extractor/Testing_file.py b/cellar/cellar_extractor/Testing_file.py index 8204c52..412e4e8 100644 --- a/cellar/cellar_extractor/Testing_file.py +++ b/cellar/cellar_extractor/Testing_file.py @@ -24,7 +24,7 @@ text = get_full_text_from_html(site) cits = get_citations_with_extra_info(text) print(cits) - data,d2 = get_cellar_extra(sd='2023-01-01',max_ecli=100,save_file='n') + data = get_cellar(sd='2023-01-01',max_ecli=100,save_file='n') d3 = filter_subject_matter(data, "prices") b=2 pass \ No newline at end of file diff --git a/cellar/cellar_extractor/cellar.py b/cellar/cellar_extractor/cellar.py index 820acc8..3d12be0 100644 --- a/cellar/cellar_extractor/cellar.py +++ b/cellar/cellar_extractor/cellar.py @@ -91,7 +91,7 @@ def filter_subject_matter(df=None, phrase=None): print("Incorrect input values! \n Returning... \n") else: try: - mask = df["LEGAL RESOURCE IS ABOUT SUBJECT MATTER"].str.lower().str.contains(phrase,na=False) + mask = df["LEGAL RESOURCE IS ABOUT SUBJECT MATTER"].str.lower().str.contains(phrase.lower(), na=False) return df[mask] except Exception as e: print(e) diff --git a/cellar/setup.py b/cellar/setup.py index 66cd9bc..31ea5fd 100644 --- a/cellar/setup.py +++ b/cellar/setup.py @@ -10,7 +10,7 @@ setup( name='cellar_extractor', packages=find_packages(include=['cellar_extractor']), - version='1.0.57', + version='1.0.58', description='Library for extracting cellar data', author='LawTech Lab', license='MIT', From 4e9e4b366b63086839370328655ba96fd1f108eb Mon Sep 17 00:00:00 2001 From: Piotr Lewandowski Date: Thu, 17 Aug 2023 12:34:19 +0200 Subject: [PATCH 09/11] update --- cellar/cellar_extractor/nodes_and_edges.py | 4 ++-- cellar/setup.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cellar/cellar_extractor/nodes_and_edges.py b/cellar/cellar_extractor/nodes_and_edges.py index 13adf6b..087c798 100644 --- a/cellar/cellar_extractor/nodes_and_edges.py +++ b/cellar/cellar_extractor/nodes_and_edges.py @@ -8,8 +8,8 @@ def get_df_with_celexes(df,celexes): def get_edges_list(df,only_local): extraction = df[['CELEX IDENTIFIER','citing']] extraction.reset_index(inplace=True) - keys = extraction['CELEX IDENTIFIER'] - vals = extraction['citing'] + keys = extraction['CELEX IDENTIFIER'].tolist() + vals = extraction['citing'].tolist() nodes = set() edges = list() for i in range(len(keys)): diff --git a/cellar/setup.py b/cellar/setup.py index 31ea5fd..87cbb16 100644 --- a/cellar/setup.py +++ b/cellar/setup.py @@ -10,7 +10,7 @@ setup( name='cellar_extractor', packages=find_packages(include=['cellar_extractor']), - version='1.0.58', + version='1.0.59', description='Library for extracting cellar data', author='LawTech Lab', license='MIT', From 8116bc6f1fcdc634a9cee2a1bf4ed075b79aab08 Mon Sep 17 00:00:00 2001 From: Piotr Lewandowski Date: Wed, 27 Sep 2023 14:46:01 
+0200 Subject: [PATCH 10/11] logging for cellar --- cellar/cellar_extractor/cellar.py | 32 +++++++++++----------- cellar/cellar_extractor/citations_adder.py | 15 +++++----- cellar/cellar_extractor/csv_extractor.py | 3 +- cellar/cellar_extractor/json_to_csv.py | 13 +++++---- cellar/setup.py | 2 +- 5 files changed, 34 insertions(+), 31 deletions(-) diff --git a/cellar/cellar_extractor/cellar.py b/cellar/cellar_extractor/cellar.py index 3d12be0..b108844 100644 --- a/cellar/cellar_extractor/cellar.py +++ b/cellar/cellar_extractor/cellar.py @@ -3,7 +3,7 @@ import time from datetime import datetime from pathlib import Path - +import logging from tqdm import tqdm from cellar_extractor.cellar_extra_extract import extra_cellar @@ -17,16 +17,16 @@ def get_cellar(ed=None, save_file='y', max_ecli=100, sd="2022-05-01", file_forma ed = datetime.now().isoformat(timespec='seconds') file_name = 'cellar_' + sd + '_' + ed file_name = file_name.replace(":", "_") - print('\n--- PREPARATION ---\n') - print(f'Starting from specified start date: {sd}') - print(f'Up until the specified end date {ed}') + logging.info('\n--- PREPARATION ---\n') + logging.info(f'Starting from specified start date: {sd}') + logging.info(f'Up until the specified end date {ed}') eclis = get_all_eclis(starting_date=sd, ending_date=ed) - print(f"Found {len(eclis)} ECLIs") + logging.info(f"Found {len(eclis)} ECLIs") time.sleep(1) if len(eclis) > max_ecli: eclis = eclis[:max_ecli] if len(eclis) == 0: - print(f"No data to download found between {sd} and {ed}") + logging.info(f"No data to download found between {sd} and {ed}") return False all_eclis = {} concurrent_docs = 100 @@ -48,7 +48,7 @@ def get_cellar(ed=None, save_file='y', max_ecli=100, sd="2022-05-01", file_forma return df else: return all_eclis - print("\n--- DONE ---") + logging.info("\n--- DONE ---") def get_cellar_extra(ed=None, save_file='y', max_ecli=100, sd="2022-05-01", threads=10, username="", password=""): @@ -56,44 +56,44 @@ def get_cellar_extra(ed=None, save_file='y', max_ecli=100, sd="2022-05-01", thre ed = datetime.now().isoformat(timespec='seconds') data = get_cellar(ed=ed, save_file='n', max_ecli=max_ecli, sd=sd, file_format='csv') if data is False: - print("Cellar extraction unsuccessful") + logging.warning("Cellar extraction unsuccessful") return False, False - print("\n--- START OF EXTRA EXTRACTION ---") + logging.info("\n--- START OF EXTRA EXTRACTION ---") file_name = 'cellar_extra_' + sd + '_' + ed file_name = file_name.replace(":", "_") file_path = os.path.join('data', file_name + '.csv') if save_file == 'y': Path('data').mkdir(parents=True, exist_ok=True) extra_cellar(data=data, filepath=file_path, threads=threads, username=username, password=password) - print("\n--- DONE ---") + logging.info("\n--- DONE ---") else: data, json = extra_cellar(data=data, threads=threads, username=username, password=password) - print("\n--- DONE ---") + logging.info("\n--- DONE ---") return data, json def get_nodes_and_edges_lists(df=None, only_local=False): if df is None: - print("No dataframe passed!") + logging.warning("No dataframe passed!") return try: nodes, edges = get_nodes_and_edges(df,only_local) except: - print('Something went wrong. Nodes and edges creation unsuccessful.') + logging.warning('Something went wrong. Nodes and edges creation unsuccessful.') return False, False return nodes, edges def filter_subject_matter(df=None, phrase=None): if df is None or phrase is None: - print("Incorrect input values! \n Returning... \n") + logging.info("Incorrect input values! 
\n Returning... \n") else: try: mask = df["LEGAL RESOURCE IS ABOUT SUBJECT MATTER"].str.lower().str.contains(phrase.lower(), na=False) return df[mask] except Exception as e: - print(e) - print("Something went wrong!\n Returning... \n") + logging.warning(e) + logging.warning("Something went wrong!\n Returning... \n") return None diff --git a/cellar/cellar_extractor/citations_adder.py b/cellar/cellar_extractor/citations_adder.py index 99de07d..ba47721 100644 --- a/cellar/cellar_extractor/citations_adder.py +++ b/cellar/cellar_extractor/citations_adder.py @@ -1,6 +1,7 @@ import sys import threading import time +import logging from io import StringIO from os.path import dirname, abspath import pandas as pd @@ -111,7 +112,7 @@ def process_queries(link, celex): response = run_eurlex_webservice_query(query, username, password) if response.status_code == 500 and "WS_WS_CALLS_IDLE_INTERVAL" not in response.text: perc=i*100/len(celexes) - print(f"Limit of web service usage reached! Citations collection will stop here at {perc} % of citations downloaded." + logging.info(f"Limit of web service usage reached! Citations collection will stop here at {perc} % of citations downloaded." + f"\nThere were {success} successful queries and {retry} retries") return elif "0" in response.text: @@ -124,7 +125,7 @@ def process_queries(link, celex): failure = True except: retry+=1 - #print(response.content) + #logging.info(response.content) time.sleep(0.5) time.sleep(2) if len(normal_celex)>0: @@ -225,24 +226,24 @@ def add_citations_separate_webservice(data, username, password): response = run_eurlex_webservice_query(query, username, password) if response.status_code == 500 : if "WS_MAXIMUM_NB_OF_WS_CALLS" in response.text: - print("Maximum number of calls to the eurlex webservices reached! The code will skip the citations download.") + logging.warning("Maximum number of calls to the eurlex webservices reached! The code will skip the citations download.") return else: - print("Incorrect username and password for eurlex webservices! (The account login credentials and webservice) " + logging.warning("Incorrect username and password for eurlex webservices! (The account login credentials and webservice) " + "login credentials are different)") sys.exit(2) elif response.status_code == 403: - print("Webservice connection was blocked, eurlex might be going through maintenance right now.") + logging.info("Webservice connection was blocked, eurlex might be going through maintenance right now.") sys.exit(2) else: - print("Webservice connection was successful!") + logging.info("Webservice connection was successful!") time.sleep(1) dictionary_list = list() execute_citations_webservice(dictionary_list,celex,username,password) citing_dict = dict() for d in dictionary_list: citing_dict.update(d) - print("Webservice extraction finished, the rest of extraction will now happen.") + logging.info("Webservice extraction finished, the rest of extraction will now happen.") time.sleep(1) # It seemed to print out the length of dictionary wrong, even when it was equal to 1000. 
cited_dict = reverse_citing_dict(citing_dict) diff --git a/cellar/cellar_extractor/csv_extractor.py b/cellar/cellar_extractor/csv_extractor.py index b623aac..d6a4527 100644 --- a/cellar/cellar_extractor/csv_extractor.py +++ b/cellar/cellar_extractor/csv_extractor.py @@ -1,5 +1,6 @@ import glob import argparse +import logging from cellar_extractor.json_to_csv import read_csv """ @@ -11,7 +12,7 @@ def extract_rows(data, number): try: output = data[1:number] except Exception: - print(f"The file does not have {number} entries, returning entire file.") + logging.info(f"The file does not have {number} entries, returning entire file.") output = data return output diff --git a/cellar/cellar_extractor/json_to_csv.py b/cellar/cellar_extractor/json_to_csv.py index 1a1e62e..2781ae2 100644 --- a/cellar/cellar_extractor/json_to_csv.py +++ b/cellar/cellar_extractor/json_to_csv.py @@ -1,6 +1,7 @@ import csv import re import warnings +import logging from bs4 import BeautifulSoup import sys import pandas as pd @@ -90,8 +91,8 @@ def read_csv(file_path): data = pd.read_csv(file_path, sep=",", encoding='utf-8') return data except Exception: - print("Something went wrong when trying to open the csv file!") - print(f" The path to the file was {file_path}") + logging.info("Something went wrong when trying to open the csv file!") + logging.info(f" The path to the file was {file_path}") sys.exit(2) @@ -112,10 +113,10 @@ def json_to_csv_returning(json_data): if final_data: return create_csv_returning(final_data) else: - print("Error creating dataframe. Data is empty.") + logging.info("Error creating dataframe. Data is empty.") return False else: - print("Error reading json file. Please make sure json file exists and contains data.") + logging.info("Error reading json file. Please make sure json file exists and contains data.") return False @@ -125,9 +126,9 @@ def json_to_csv_main(json_data, filepath): if final_data: create_csv(filepath=filepath, encoding="UTF8", data=final_data) else: - print("Error creating CSV file. Data is empty.") + logging.info("Error creating CSV file. Data is empty.") return False else: - print("Error reading json file. Please make sure json file exists and contains data.") + logging.info("Error reading json file. 
Please make sure json file exists and contains data.") return False return True diff --git a/cellar/setup.py b/cellar/setup.py index 87cbb16..11bb73c 100644 --- a/cellar/setup.py +++ b/cellar/setup.py @@ -10,7 +10,7 @@ setup( name='cellar_extractor', packages=find_packages(include=['cellar_extractor']), - version='1.0.59', + version='1.0.60', description='Library for extracting cellar data', author='LawTech Lab', license='MIT', From 560fc4d29c2cc211344952aadc2db5a54b6201af Mon Sep 17 00:00:00 2001 From: Piotr Lewandowski Date: Tue, 3 Oct 2023 10:46:37 +0200 Subject: [PATCH 11/11] logging fix --- cellar/cellar_extractor/__init__.py | 2 ++ cellar/setup.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/cellar/cellar_extractor/__init__.py b/cellar/cellar_extractor/__init__.py index 7821b0c..39184aa 100644 --- a/cellar/cellar_extractor/__init__.py +++ b/cellar/cellar_extractor/__init__.py @@ -2,3 +2,5 @@ from cellar_extractor.cellar import get_cellar_extra from cellar_extractor.cellar import get_nodes_and_edges_lists from cellar_extractor.cellar import filter_subject_matter +import logging +logging.basicConfig(level=logging.INFO) \ No newline at end of file diff --git a/cellar/setup.py b/cellar/setup.py index 11bb73c..eec4dce 100644 --- a/cellar/setup.py +++ b/cellar/setup.py @@ -10,7 +10,7 @@ setup( name='cellar_extractor', packages=find_packages(include=['cellar_extractor']), - version='1.0.60', + version='1.0.61', description='Library for extracting cellar data', author='LawTech Lab', license='MIT',
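
The net effect of the patches above is that cellar_extractor reports progress through the standard logging module (configured to INFO at import time in __init__.py), filters subject matter case-insensitively, and builds its node and edge lists from plain Python lists rather than pandas Series. A minimal usage sketch follows; it assumes the package is installed from PyPI and that get_cellar is exported from the package root alongside get_nodes_and_edges_lists and filter_subject_matter (only the latter two exports are visible in the __init__.py hunk above), so treat it as illustrative rather than a verbatim recipe.

    import logging

    # Assumed import list: get_cellar's export is not shown in the __init__.py
    # hunk above, while the other two names are.
    from cellar_extractor import (
        get_cellar,
        get_nodes_and_edges_lists,
        filter_subject_matter,
    )

    # Patch 11/11 calls logging.basicConfig(level=logging.INFO) at import time;
    # an application that wants a quieter root logger can raise the threshold
    # back to WARNING after importing the package.
    logging.getLogger().setLevel(logging.WARNING)

    # In-memory extraction, mirroring Testing_file.py: save_file='n' returns a
    # DataFrame instead of writing into the data/ directory.
    data = get_cellar(sd='2023-01-01', max_ecli=100, save_file='n')

    if data is not False:
        # Case-insensitive subject-matter filter from the "small fix" patch.
        priced = filter_subject_matter(data, "Prices")
        # Node and edge lists built from the CELEX IDENTIFIER / citing columns.
        nodes, edges = get_nodes_and_edges_lists(df=data, only_local=False)

Calling logging.basicConfig inside __init__.py is a convenience for one-off scripts; applications that configure logging themselves will usually want to reset the root level after the import, as the sketch does.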