diff --git a/Dockerfile b/Dockerfile
index e2007b6..c64e6cd 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -15,7 +15,7 @@ ENV POSTGRES_HOST=host.docker.internal
 ENV POSTGRES_PORT=5432
 ENV POSTGRES_DB=postgres
 ENV POSTGRES_USER=postgres
-ENV POSTGRES_PASSWORD=******
+ENV POSTGRES_PASSWORD=*****
 
 WORKDIR /home/link-liveliness-assessment
 
@@ -39,4 +39,4 @@ EXPOSE 8000
 
 USER linky
 
-ENTRYPOINT [ "python3", "-m", "uvicorn", "api:app", "--reload", "--host", "0.0.0.0", "--port", "8000" ]
\ No newline at end of file
+# ENTRYPOINT [ "python3", "-m", "uvicorn", "api:app", "--reload", "--host", "0.0.0.0", "--port", "8000" ]
diff --git a/README.md b/README.md
index faf3951..677ce2d 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,7 @@
-# OGC API - Records; link liveliness assessment
+# OGC API - Records; link liveliness assessment tool
+
+## Overview
+The linkchecker component evaluates the validity and accuracy of links within metadata records in an OGC API - Records system.
 
 A component which evaluates for a set of metadata records (describing either data or knowledge sources), if:
 
@@ -6,10 +9,9 @@ A component which evaluates for a set of metadata records (describing either dat
 - the links within the repository are valid
 - link metadata represents accurately the resource
 
-The component either returns a http status: 200 (ok), 403 (non autorized), 404 (not found), 500 (error), ...
-Status 302 is forwarded to new location and the test is repeated.
+The component returns an HTTP status for each link: 200 (OK), 401 (unauthorized), 404 (not found), 500 (server error)
 
-The component runs an evaluation for a single resource at request, or runs tests at intervals providing a history of availability
+The component runs an evaluation for a single resource on request, or runs tests at intervals, providing a history of availability
 
 A link either points to:
 
@@ -21,13 +23,34 @@ If endpoint is API, some sanity checks can be performed on the API:
 - Identify if the API adopted any API-standard
 - IF an API standard is adopted, does the API support basic operations of that API
 
+The benefit of the latter is that it provides more information than a simple ping to the index page of the API; typical examples of standardised APIs are SOAP,
+GraphQL, SPARQL, OpenAPI, WMS, WFS
-The benefit of latter is that it provides more information then a simple ping to the index page of the API, typical examples of standardised API's are SOAP, GraphQL, SPARQL, OpenAPI, WMS, WFS
-
-## OGC API - records
-
-OGC is in the process of adopting the [OGC API - Records](https://github.com/opengeospatial/ogcapi-records) specification. A standardised API to interact with Catalogues. The specification includes a datamodel for metadata. This tool assesses the linkage section of any record in an OGC API - Records.
-
+***Sample response***
+```
+ {
+   "id": 25,
+   "urlname": "https://demo.pycsw.org/gisdata/collections/metadata:main/queryables",
+   "parent_urls": [
+     "https://demo.pycsw.org/gisdata/collections?f=html"
+   ],
+   "status": "200 OK",
+   "result": "",
+   "info": "True",
+   "warning": "",
+   "deprecated": null
+ }
+```
+## OGC API - Records
+OGC is in the process of adopting the [OGC API - Records](https://github.com/opengeospatial/ogcapi-records) specification, a standardised API to interact with catalogues. The specification includes a data model for metadata.
+This tool assesses the linkage section of any record in an OGC API - Records catalogue.
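Since the tool works on the linkage section of catalogue records, a minimal sketch of how those links can be collected from an OGC API - Records endpoint may help readers reproduce the input set. This is illustrative only (the tool itself crawls the HTML views with LinkChecker); the demo endpoint and collection names are taken from the sample response above, and it assumes items are served as GeoJSON with a standard `links` array.

```python
# Illustrative sketch: collect candidate link targets from an OGC API - Records
# collection. Not the tool's own crawler, which runs LinkChecker over the HTML views.
import requests

OGCAPI_URL = "https://demo.pycsw.org/gisdata"   # assumed demo endpoint (see sample above)
COLLECTION = "metadata:main"

def harvest_links(base_url: str, collection: str, limit: int = 10) -> set:
    """Return distinct link targets found in the first `limit` records."""
    items_url = f"{base_url}/collections/{collection}/items"
    resp = requests.get(items_url, params={"f": "json", "limit": limit}, timeout=10)
    resp.raise_for_status()
    found = set()
    for feature in resp.json().get("features", []):
        for link in feature.get("links", []):   # the record's linkage section
            href = link.get("href", "")
            if href.startswith("http"):
                found.add(href)
    return found

if __name__ == "__main__":
    for url in sorted(harvest_links(OGCAPI_URL, COLLECTION)):
        print(url)
```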
+
+OGC services (WMS, WFS, WCS, CSW) often return an HTTP 500 or 400 Bad Request error when called without the necessary parameters,
+because these services expect specific parameters that define which operation to perform.
+Handling for these URL formats has been added to detect and append the required parameters before the link is checked.
+
 Set the endpoint to be analysed as 2 environment variables
 
 ```
@@ -35,82 +58,71 @@ export OGCAPI_URL=https://soilwise-he.containers.wur.nl/cat/
 export OGCAPI_COLLECTION=metadata:main
 ```
 
-## Source Code Brief Desrciption
-
-Running the linkchecker.py will utilize the requests library from python to get the relevant EJP Soil Catalogue source.
-Run the command below
-* python linkchecker.py
-The urls selected from the requests will passed to linkchecker using the proper options.
-The output generated will be written to a PostgreSQL database.
-A .env is required to define the database connection parameters.
-More specifically the following parameters must be specified
-
-```
- POSTGRES_HOST=
- POSTGRES_PORT=
- POSTGRES_DB=
- POSTGRES_USER=
- POSTGRES_PASSWORD=
-```
-
-## API
-The api.py file creates a FastAPI in order to retrieve links statuses.
-Run the command below:
+## API Key Features
+1. **Link validation**:
+Returns the HTTP status code for each link, along with other important information such as the parent URL, any warnings, and the date and time of the test.
+![Fast API link_status](./images/link_status.png)
+2. **Broken link categorization**:
+Identifies and categorizes broken links based on status codes, including Redirection Errors, Client Errors, and Server Errors.
+![Link categorization endpoint](./images/categorization.png)
+3. **Deprecated links identification**:
+Flags links as deprecated once they have failed X consecutive tests (currently X = 10).
+Deprecated links are excluded from future tests to optimize performance.
+![Fast API deprecated endpoint](./images/deprecated.png)
+4. **Timeout management**:
+Identifies URLs that exceed a timeout threshold, which can be set manually as a parameter in the linkchecker's properties.
+![Fast API timeout endpoint](./images/timeouts.png)
+5. **Availability monitoring**:
+When run periodically, the tool builds a history of availability for each URL, enabling users to view the status of links over time.
+![Link validation endpoint](./images/val_history.png)
+
+## Container Deployment
+
+The app can be deployed as a container; a docker-compose file is provided.
+Set the environment variables in the Dockerfile to enable the database connection.
+
+Run ***docker-compose up*** to start the containers.
+
+***Helpful commands***
+To run the FastAPI service locally, run:
 ```
 python -m uvicorn api:app --reload --host 0.0.0.0 --port 8000
 ```
-To view the service of the FastAPI on [http://127.0.0.1:8000/docs]
-
-
-# Get current URL Status History
-This endpoint returns the history of a specific URL.
-Let say we have the status of a specific URL over time
-
-| id  | url                    | validation_result | timestamp               |
-|-----|------------------------|-------------------|-------------------------|
-| 1   | https://example.com    | 200 OK            | 2023-01-01 10:00:00+00  |
-| 2   | https://wikipedia.com  | 404 Not Found     | 2023-01-01 10:00:05+00  |
-| 3   | https://example.com    | 200 OK            | 2023-01-02 11:00:00+00  |
-| 4   | https://wikipedia.com  | 500 Server Error  | 2023-01-02 11:00:05+00  |
-| 5   | https://wikipedia.com  | 200 OK            | 2023-01-02 11:00:10+00  |
+The FastAPI Swagger UI is then available at [http://127.0.0.1:8000/docs] (when ROOTPATH is not set)
 
-Running the `/Single_url_status_history` endpoint for the
-https://wikipedia.com and setting limit = 2 it will fetch the following result:
+## Deploy linky at a path
+You can set the ROOTPATH env var to run the API at a sub-path (the default is the root)
 
-| id  | url                    | validation_result | timestamp               |
-|-----|------------------------|-------------------|-------------------------|
-| 1   | https://wikipedia.com  | 500 Server Error  | 2023-01-02 11:00:05+00  |
-| 2   | https://wikipedia.com  | 404 Not Found     | 2023-01-01 10:00:05+00  |
-
-This is the URL's history in descenting order in datetime
-
-# Docker
-=======
-## Deploy `linky` at a path
-
-You can set `ROOTPATH` env var to run the api at a path (default is at root)
-
-```
 export ROOTPATH=/linky
-```
-
-## Docker
-
-A Docker instance must be running for the linkchecker command to work.
 
 ## CI/CD
 
-A workflow is provided in order to run it as a cronological job once per week every Sunday Midnight
-(However currently it is commemended to save running minutes since it takes about 80 minutes to complete)
-It is necessary to use the **secrets context in gitlab in order to be connected to database
+A CI/CD configuration file is provided to run the checks as a scheduled (cron-style) pipeline.
+The database connection must be configured through the GitLab secrets context.
 
 ## Roadmap
-
 ### GeoHealthCheck integration
 
-[GeoHealthCheck](https://GeoHealthCheck.org) is a component to monitor livelyhood of typical OGC services (WMS, WFS, WCS, CSW). It is based on the [owslib](https://owslib.readthedocs.io/en/latest/) library, which provides a python implementation of various OGC services clients.
+[GeoHealthCheck](https://GeoHealthCheck.org) is a component to monitor the liveliness of typical OGC services (WMS, WFS, WCS, CSW).
+It is based on the [owslib](https://owslib.readthedocs.io/en/latest/) library, which provides a Python implementation of clients for various OGC services.
 
-## Soilwise-he project
+## Technological Stack
 
-This work has been initiated as part of the [Soilwise-he project](https://soilwise-he.eu/).
-The project receives funding from the European Union’s HORIZON Innovation Actions 2022 under grant agreement No. 101112838.
+1. **Core Language**:
+   - Python: Used for the linkchecker, API, and database interactions.
+
+2. **Database**:
+   - PostgreSQL: Used for storing link check results and their validation history.
+
+3. **Backend Framework**:
+   - FastAPI: Used to create and expose the REST API endpoints, with auto-generated documentation such as Swagger.
+
+4. **Containerization**:
+   - Docker: Used to containerize the linkchecker application for consistent deployment and execution across environments.
+
+## Soilwise-he project
+This work has been initiated as part of the [Soilwise-he project](https://soilwise-he.eu/).
+The project receives funding from the European Union’s HORIZON Innovation Actions 2022 under grant agreement No. 101112838.
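As a usage note for the reworked history endpoint defined later in `src/api.py` (`/URL_status_history`), the following hedged sketch shows how a client might query it. The host and port assume the local uvicorn command from the README, and the example URL and limit simply mirror the old README example.

```python
# Hedged usage sketch for the /URL_status_history endpoint defined in src/api.py.
# Assumes the API is running locally via uvicorn on port 8000 with no ROOTPATH set.
import requests

API_BASE = "http://127.0.0.1:8000"

resp = requests.get(
    f"{API_BASE}/URL_status_history",
    params={"url": "https://wikipedia.com", "limit": 2},  # illustrative values
    timeout=10,
)
resp.raise_for_status()
for record in resp.json():
    # Fields follow the URLAvailabilityResponse model in src/api.py
    print(record["last_checked"], record["urlname"], record["validation_valid"])
```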
\ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index e5fd3ef..7addd9c 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -4,14 +4,14 @@ services: context: . dockerfile: Dockerfile container_name: linkchecker - command: python linkchecker.py - api: - build: - context: . - dockerfile: Dockerfile - container_name: FastApi - ports: - - "8000:8000" - command: python3 -m uvicorn api:app --host 0.0.0.0 --port 8000 --reload - depends_on: - - linkchecker + # command: python linkchecker.py + # api: + # build: + # context: . + # dockerfile: Dockerfile + # container_name: FastApi + # ports: + # - "8000:8000" + # command: python3 -m uvicorn api:app --host 0.0.0.0 --port 8000 --reload + # depends_on: + # - linkchecker diff --git a/images/categorization.png b/images/categorization.png new file mode 100644 index 0000000..2d78544 Binary files /dev/null and b/images/categorization.png differ diff --git a/images/deprecated.png b/images/deprecated.png new file mode 100644 index 0000000..1126217 Binary files /dev/null and b/images/deprecated.png differ diff --git a/images/link_status.png b/images/link_status.png new file mode 100644 index 0000000..d8421e8 Binary files /dev/null and b/images/link_status.png differ diff --git a/images/timeouts.png b/images/timeouts.png new file mode 100644 index 0000000..842302b Binary files /dev/null and b/images/timeouts.png differ diff --git a/images/val_history.png b/images/val_history.png new file mode 100644 index 0000000..b875867 Binary files /dev/null and b/images/val_history.png differ diff --git a/src/__pycache__/api.cpython-311.pyc b/src/__pycache__/api.cpython-311.pyc index 6f6b951..57b4e97 100644 Binary files a/src/__pycache__/api.cpython-311.pyc and b/src/__pycache__/api.cpython-311.pyc differ diff --git a/src/api.py b/src/api.py index 07d610b..35c32ea 100644 --- a/src/api.py +++ b/src/api.py @@ -34,21 +34,22 @@ class StatusResponse(BaseModel): id: int urlname: Optional[str] - parentname: Optional[str] - valid: Optional[str] + parent_urls: Optional[List[str]] + status: Optional[str] + result: Optional[str] + info: Optional[str] warning: Optional[str] + deprecated: Optional[bool] = None # Model to get the availability history of a specific url class URLAvailabilityResponse(BaseModel): - url: Optional[str] - perent_url: Optional[str] - validation_valid: Optional[str] - result: Optional[str] - warning: Optional[str] - lastChecked: Optional[datetime] - -class DeprecatedUrlsResponse(BaseModel): - url: Optional[str] + urlname: Optional[str] = None + status: Optional[str] = None + result: Optional[str] = None + info: Optional[str] = None + warning: Optional[str] = None + validation_valid: Optional[str] = None + last_checked: datetime # Define status lists REDIRECTION_STATUSES = [ @@ -80,150 +81,195 @@ async def fetch_data(query: str, values: dict = {}): try: return await database.fetch_all(query=query, values=values) except asyncpg.exceptions.UndefinedTableError: + logging.error("The specified table does not exist", exc_info=True) raise HTTPException(status_code=500, detail="The specified table does not exist") except Exception as e: - raise HTTPException(status_code=500, detail="Database query failed") from e - + logging.error(f"Database query failed: {e}", exc_info=True) + raise HTTPException(status_code=500, detail="Database query failed") + +# Endpoint to retrieve data with redirection statuses @app.get('/Redirection_URLs/3xx', response_model=List[StatusResponse]) async def get_redirection_statuses(): - query = "SELECT 
DISTINCT * FROM linkchecker_output WHERE valid = ANY(:statuses)" + query = """ + SELECT + l.id_link AS id, + l.urlname, + l.status, + l.warning, + l.result, + l.info, + array_remove(array_agg(DISTINCT p.parentname), NULL) AS parent_urls + FROM + links l + LEFT JOIN + parent p ON l.id_link = p.fk_link + WHERE + l.status ILIKE ANY (:statuses) + GROUP BY + l.id_link, l.urlname, l.status, l.warning, result, info + """ data = await fetch_data(query=query, values={'statuses': REDIRECTION_STATUSES}) return data # Endpoint to retrieve data with client error statuses @app.get('/Client_Error_URLs/4xx', response_model=List[StatusResponse]) async def get_client_error_statuses(): - query = "SELECT DISTINCT * FROM linkchecker_output WHERE valid = ANY(:statuses)" + query = """ + SELECT + l.id_link AS id, + l.urlname, + l.status, + l.warning, + l.result, + l.info, + array_remove(array_agg(DISTINCT p.parentname), NULL) AS parent_urls + FROM + links l + LEFT JOIN + parent p ON l.id_link = p.fk_link + WHERE + l.status ILIKE ANY (:statuses) + GROUP BY + l.id_link, l.urlname, l.status, l.warning, result, info + """ data = await fetch_data(query=query, values={'statuses': CLIENT_ERROR_STATUSES}) return data # Endpoint to retrieve data with server error statuses @app.get('/Server_Errors_URLs/5xx', response_model=List[StatusResponse]) async def get_server_error_statuses(): - query = "SELECT DISTINCT * FROM linkchecker_output WHERE valid = ANY(:statuses)" + query = """ + SELECT + l.id_link AS id, + l.urlname, + l.status, + l.warning, + l.result, + l.info, + array_remove(array_agg(DISTINCT p.parentname), NULL) AS parent_urls + FROM + links l + LEFT JOIN + parent p ON l.id_link = p.fk_link + WHERE + l.status ILIKE ANY (:statuses) + GROUP BY + l.id_link, l.urlname, l.status, l.warning, result, info + """ data = await fetch_data(query=query, values={'statuses': SERVER_ERROR_STATUSES}) return data -# Endpoint to retrieve data where the warning column is not empty -@app.get('/URLs_Which_Have_Warnings', response_model=List[StatusResponse]) -async def get_non_empty_warnings(): - query = "SELECT DISTINCT * FROM linkchecker_output WHERE warning != ''" - data = await fetch_data(query=query) - return data - # Endpoint to retrieve data with client error statuses @app.get('/status/{item:path}', response_model=List[StatusResponse]) async def get_status_for_url(item): - decoded_item = unquote(item) - query = "SELECT * FROM linkchecker_output WHERE urlname = :item" - data = await fetch_data(query=query, values={'item': decoded_item }) + query = """ + SELECT + l.id_link AS id, + l.urlname, + l.status, + l.warning, + l.result, + l.info, + array_remove(array_agg(DISTINCT p.parentname), NULL) AS parent_urls + FROM + links l + LEFT JOIN + parent p ON l.id_link = p.fk_link + WHERE + l.urlname = :item + GROUP BY + l.id_link, l.urlname, l.status, l.warning, result, info + """ + data = await fetch_data(query=query, values={'item': item}) return data # Endpoint to retrieve URLs that that timed out. 
Timeout is set to 5 seconds currently @app.get('/Timeout_URLs', response_model=List[StatusResponse]) async def get_timeout_urls(): query = """ - SELECT DISTINCT * - FROM linkchecker_output - WHERE valid LIKE '%ReadTimeout%' OR valid LIKE '%ConnectTimeout%' + SELECT + l.id_link AS id, + l.urlname, + l.status, + l.warning, + l.result, + l.info, + array_remove(array_agg(DISTINCT p.parentname), NULL) AS parent_urls + FROM + links l + LEFT JOIN + parent p ON l.id_link = p.fk_link + WHERE + l.status LIKE '%ReadTimeout%' OR l.status LIKE '%ConnectTimeout%' + GROUP BY + l.id_link, l.urlname, l.status, l.warning, result, info """ data = await fetch_data(query=query) return data -@app.get("/Single_url_status_history", response_model=List[URLAvailabilityResponse]) -async def get_current_url_status_history( - url: str = Query(..., description="URL to get avalability"), - limit: int = Query(100, ge=1, le=1000, description="Maximum number of results (default: 100, min: 1, max: 1000)")) -> List[URLAvailabilityResponse]: +@app.get('/Deprecated URLs', response_model=List[StatusResponse]) +async def get_deprecated_urls(): query = """ - SELECT - lo.urlname AS url, - lo.parentname AS parent_url, - lo.result AS result, - lo.warning AS warning, - vh.validation_result AS validation_valid, - vh.timestamp AS last_checked - FROM - linkchecker_output lo - JOIN ( SELECT - url, - validation_result, - timestamp, - ROW_NUMBER() OVER (PARTITION BY url ORDER BY timestamp DESC) as rn + l.id_link AS id, + l.urlname, + l.status, + l.warning, + l.result, + l.info, + l.deprecated, + array_remove(array_agg(DISTINCT p.parentname), NULL) AS parent_urls FROM - validation_history - ) vh ON lo.urlname = vh.url AND vh.rn = 1 - WHERE (lo.urlname = :url) - LIMIT :limit + links l + LEFT JOIN + parent p ON l.id_link = p.fk_link + WHERE l.deprecated IS TRUE + GROUP BY + l.id_link, l.urlname, l.status, l.warning, result, info """ + data = await fetch_data(query=query) + return data - try: - results = await fetch_data(query=query, values={'url': url, 'limit': limit}) - logger.info(f"Query returned {len(results)} results.") - - response_data = [URLAvailabilityResponse(**dict(row)) for row in results] - - return response_data - except Exception as e: - logger.error(f"Error occurred: {e}", exc_info=True) - raise HTTPException(status_code=500, detail=str(e)) - -@app.get("/All_url_status_history", response_model=List[URLAvailabilityResponse]) -async def get_all_url_status_history( - limit: int = Query(100, ge=1, le=1000, description="Maximum number of results (default: 100, min: 1, max: 1000)")) -> List[URLAvailabilityResponse]: - +@app.get("/URL_status_history", response_model=List[URLAvailabilityResponse]) +async def get_url_status_history( + url: str = Query(..., description="URL to get availability history"), + limit: int = Query(100, ge=1, le=1000, description="Maximum number of results (default: 100, min: 1, max: 1000)") +) -> List[URLAvailabilityResponse]: query = """ - SELECT - lo.urlname AS url, - lo.parentname AS parent_url, - lo.result AS result, - lo.warning AS warning, - vh.validation_result AS validation_valid, - vh.timestamp AS last_checked - FROM - linkchecker_output lo - JOIN ( SELECT - url, - validation_result, - timestamp, - ROW_NUMBER() OVER (PARTITION BY url ORDER BY timestamp DESC) as rn + l.urlname, + l.status, + l.result, + l.info, + l.warning, + vh.validation_result AS validation_valid, + vh.timestamp AS last_checked, + array_agg(DISTINCT p.parentname) AS parent_urls FROM - validation_history - ) vh ON lo.urlname = 
vh.url AND vh.rn = 1 - ORDER BY - vh.timestamp DESC - LIMIT :limit + links l + LEFT JOIN + parent p ON l.id_link = p.fk_link + LEFT JOIN + validation_history vh ON l.id_link = vh.fk_link + WHERE + l.urlname = :url + GROUP BY + l.urlname, l.status, l.result, l.info, l.warning, vh.validation_result, vh.timestamp + ORDER BY + vh.timestamp DESC + LIMIT :limit """ - values = {"limit": limit} - try: - results = await fetch_data(query=query, values=values) - logging.info(f"Query returned {len(results)} results.") - - response_data = [URLAvailabilityResponse(**row) for row in results] - + results = await fetch_data(query=query, values={'url': url, 'limit': limit}) + logger.info(f"Query returned {len(results)} results for URL: {url}") + + response_data = [URLAvailabilityResponse(**dict(row)) for row in results] + return response_data except Exception as e: - logging.error(f"Error occurred: {e}", exc_info=True) - raise HTTPException(status_code=500, detail=str(e)) - -@app.get('/Deprecated URLs', response_model=List[DeprecatedUrlsResponse]) -async def get_deprecated_urls(): - query = """ - SELECT - us.url AS url - FROM - url_status us - WHERE us.deprecated = TRUE - """ - try: - data = await fetch_data(query=query) - return data - except Exception as e: - logging.error(f"Error occurred: {e}", exc_info=True) - raise HTTPException(status_code=500, detail=str(e)) + logger.error(f"Error occurred while fetching URL status history: {e}", exc_info=True) + raise HTTPException(status_code=500, detail="Failed to fetch URL status history") # Start the application @app.on_event('startup') diff --git a/src/linkchecker.py b/src/linkchecker.py index 8e1a7fc..77e0355 100644 --- a/src/linkchecker.py +++ b/src/linkchecker.py @@ -1,5 +1,6 @@ from bs4 import BeautifulSoup from dotenv import load_dotenv +from urllib.parse import urlparse, parse_qs, urlencode import subprocess import psycopg2 import psycopg2.extras @@ -10,7 +11,7 @@ import os # When a URL reaches MAX_FAILURES consecutive failures it's marked -# as deprecated and excluded from future checks +# as deprecated and excluded from future insertions in database MAX_FAILURES = 10 # Load environment variables from .env file @@ -32,45 +33,48 @@ def setup_database(): password=os.environ.get("POSTGRES_PASSWORD") ) cur = conn.cursor() + + # Drop tables (only for development purposes) + # cur.execute("DROP TABLE IF EXISTS validation_history CASCADE") + # cur.execute("DROP TABLE IF EXISTS parent CASCADE") + # cur.execute("DROP TABLE IF EXISTS links CASCADE") # Create or truncate linkchecker_output table - cur.execute("DROP TABLE IF EXISTS linkchecker_output") create_table_query = """ - CREATE TABLE linkchecker_output ( - id SERIAL PRIMARY KEY, - urlname TEXT, - parentname TEXT, - baseref TEXT, - valid TEXT, + CREATE TABLE IF NOT EXISTS links ( + id_link SERIAL PRIMARY KEY, + urlname TEXT UNIQUE, + status TEXT, result TEXT, - warning TEXT, info TEXT, - url TEXT, - name TEXT + warning TEXT, + deprecated BOOLEAN DEFAULT FALSE, + consecutive_failures INTEGER DEFAULT 0 ) """ cur.execute(create_table_query) - + # Create validation_history table if it doesn't exist cur.execute(""" - CREATE TABLE IF NOT EXISTS validation_history ( + CREATE TABLE IF NOT EXISTS parent ( id SERIAL PRIMARY KEY, - url TEXT NOT NULL, - validation_result TEXT NOT NULL, - timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP + parentname TEXT NULL, + baseref TEXT NULL, + fk_link INTEGER REFERENCES links(id_link), + UNIQUE (parentname, baseref, fk_link) ) """) - + # Create url_status table if it doesn't exist 
cur.execute(""" - CREATE TABLE IF NOT EXISTS url_status ( - url TEXT PRIMARY KEY, - consecutive_failures INTEGER DEFAULT 0, - deprecated BOOLEAN DEFAULT FALSE, - last_checked TIMESTAMP + CREATE TABLE IF NOT EXISTS validation_history ( + id SERIAL PRIMARY KEY, + fk_link INTEGER REFERENCES links(id_link), + validation_result TEXT NOT NULL, + timestamp TIMESTAMP NOT NULL ) """) - + conn.commit() return conn, cur @@ -85,7 +89,7 @@ def get_pagination_info(url): number_matched = data.get('numberMatched', 0) number_returned = data.get('numberReturned', 0) - # Calculate total pages + # Calculate total pages total_pages = math.ceil(number_matched / number_returned) return total_pages, number_returned except requests.exceptions.RequestException as e: @@ -116,63 +120,108 @@ def extract_links(url): print(f"Error extracting links from {url}: {e}") return [] -def run_linkchecker(urls): - for url in urls: - # Run LinkChecker Docker command with specified user and group IDs for each URL - process = subprocess.Popen([ - "linkchecker", - "--verbose", - "--check-extern", - "--recursion-level=1", - "--timeout=5", - "--output=csv", - url + "?f=html" - ], stdout=subprocess.PIPE, stderr=subprocess.PIPE) +def check_single_url(url): + process = subprocess.Popen([ + "linkchecker", + "--verbose", + "--check-extern", + "--recursion-level=0", + "--timeout=5", + "--output=csv", + url + "?f=html" + ], stdout=subprocess.PIPE, stderr=subprocess.PIPE) - # Process the output line by line and yield each line - for line in process.stdout: - yield line.decode('utf-8').strip() # Decode bytes to string and strip newline characters - # Wait for the process to finish - process.wait() + # Process.communicate is good for shorter-running processes + stdout, _ = process.communicate() -def insert_validation_history(conn, url, validation_result, is_valid): - with conn.cursor() as cur: - # Insert new record in validation_history - cur.execute(""" - INSERT INTO validation_history (url, validation_result) - VALUES (%s, %s) - """, (url, validation_result)) + return stdout.decode('utf-8').strip().split('\n') + +def run_linkchecker(url): + # Run LinkChecker Docker command with specified user and group IDs for each URL + process = subprocess.Popen([ + "linkchecker", + "--verbose", + "--check-extern", + "--recursion-level=1", + "--timeout=5", + "--output=csv", + url + "?f=html" + ], stdout=subprocess.PIPE, stderr=subprocess.PIPE) + # Process the output line by line and yield each line + # Memory efficient for large outputs + for line in process.stdout: + yield line.decode('utf-8').strip() # Decode bytes to string and strip newline characters + # Wait for the process to finish + process.wait() + +def insert_or_update_link(conn, urlname, status, result, info, warning, is_valid): + + with conn.cursor() as cur: # Get current status - cur.execute("SELECT consecutive_failures, deprecated FROM url_status WHERE url = %s", (url,)) - result = cur.fetchone() + cur.execute("SELECT id_link, consecutive_failures, deprecated FROM links WHERE urlname = %s", (urlname,)) + existing_link = cur.fetchone() + + if existing_link: + link_id, consecutive_failures, deprecated = existing_link - if result: - consecutive_failures, deprecated = result + if existing_link[2]: + # Ignore deprecated URL's + # Deprecated URL's are these urls that consecutive have failed for MAX_FAILURES times + return None + if not is_valid: consecutive_failures += 1 else: consecutive_failures = 0 - + deprecated = deprecated or (consecutive_failures >= MAX_FAILURES) - - # Update url_status + + # 
Updade existing link cur.execute(""" - UPDATE url_status - SET consecutive_failures = %s, - deprecated = %s, - last_checked = CURRENT_TIMESTAMP - WHERE url = %s - """, (consecutive_failures, deprecated, url)) + UPDATE links SET + status = %s, + result = %s, + info = %s, + warning = %s, + deprecated = %s, + consecutive_failures = %s + WHERE id_link = %s + """,(status, result, info, warning, deprecated, consecutive_failures, link_id)) else: - # Insert new url_status if not exists + # Insert new link (not deprecated on the first insertion) cur.execute(""" - INSERT INTO url_status (url, consecutive_failures, deprecated, last_checked) - VALUES (%s, %s, %s, CURRENT_TIMESTAMP) - """, (url, 0 if is_valid else 1, False)) + INSERT INTO links (urlname, status, result, info, warning, deprecated, consecutive_failures) + VALUES (%s, %s, %s, %s, %s, %s, %s) + RETURNING id_link + """, (urlname, status, result, info, warning, False, 0 if is_valid else 1)) + + link_id = cur.fetchone()[0] + + # Insert new record in validation history + cur.execute(""" + INSERT INTO validation_history(fk_link, validation_result, timestamp) + VALUES(%s, %s, CURRENT_TIMESTAMP) + """,(link_id, status)) + conn.commit() + + return link_id - conn.commit() - +def insert_parent(conn, parentname, baseref, link_id): + with conn.cursor() as cur: + # Convert empty strings to None + parentname = parentname if parentname else None + baseref = baseref if baseref else None + + cur.execute(""" + INSERT INTO parent (parentname, baseref, fk_link) + VALUES (%s, %s, %s) + ON CONFLICT (parentname, baseref, fk_link) DO NOTHING + """, (parentname, baseref, link_id)) + + # Commit the transaction + conn.commit() + def is_valid_status(valid_string): # Return if status is valid or not parts = valid_string.split() @@ -180,37 +229,55 @@ def is_valid_status(valid_string): if 200 <= int(parts[0]) < 400: # Valid HTTP status codes range return True return False - + def get_active_urls(conn): with conn.cursor() as cur: cur.execute("SELECT COUNT(*) FROM validation_history") count = cur.fetchone()[0] - + if count == 0: return None # The table is empty else: cur.execute("SELECT url FROM validation_history WHERE NOT deprecated") return [row[0] for row in cur.fetchall()] -def get_all_urls(conn): - with conn.cursor() as cur: - cur.execute("SELECT COUNT(*) FROM validation_history") - count = cur.fetchone()[0] +def determine_service_type(url): + ogc_patterns = ['/wms', '/wfs', '/csw', '/wcs', 'service='] + + if any(pattern in url.lower() for pattern in ogc_patterns): + parsed_url = urlparse(url) + query_params = parse_qs(parsed_url.query) - if count == 0: - return None # The table is empty - else: - cur.execute("SELECT url FROM validation_history") - return [row[0] for row in cur.fetchall()] + query_params.pop('service', None) + query_params.pop('request', None) + + query_params['request'] = ['GetCapabilities'] + + if 'service' not in query_params: + if '/wms' in parsed_url.path.lower(): + query_params['service'] = ['WMS'] + elif '/wfs' in parsed_url.path.lower(): + query_params['service'] = ['WFS'] + elif '/csw' in parsed_url.path.lower(): + query_params['service'] = ['CSW'] + elif '/wcs' in parsed_url.path.lower(): + query_params['service'] = ['WCS'] + + new_query = urlencode(query_params, doseq=True) + new_url = parsed_url._replace(query=new_query).geturl() + + return new_url + + return url def main(): start_time = time.time() # Start timing # Set up the database and create the table print("Setting PostgreSQL db") conn, cur = setup_database() - - print("Time 
started processing links.") - print("Loading EJP SOIL Catalogue links...") + + print('Time started processing links.') + print(f'Loading {catalogue_json_url} links...') total_pages, numbers_returned = get_pagination_info(catalogue_json_url) # Base URL @@ -225,48 +292,48 @@ def main(): for url in urls: extracted_links = extract_links(url) all_links.update(extracted_links) # Add new links to the set of all links - - # Define the formats to be removed - formats_to_remove = [ - 'collections/' + collection + '/items?offset', - '?f=json' - ] - - # Get the list of active (non-deprecated) URLs - all_known_urls = get_all_urls(conn) - - if all_known_urls is None: - # First run on empty table, check all links - links_to_check = all_links - else: - # Check all known links plus any new links - links_to_check = set(all_known_urls) | all_links - + # Specify the fields to include in the CSV file - fields_to_include = ['urlname', 'parentname', 'baseref', 'valid', 'result', 'warning', 'info', 'url', 'name'] + fields_to_include = ['urlname', 'parentname', 'baseref', 'valid', 'result', 'warning', 'info'] print("Checking Links...") + # Run LinkChecker and process the output - for line in run_linkchecker(links_to_check): - if re.match(r'^http', line): - # Remove trailing semicolon and split by semicolon - values = line.rstrip(';').split(';') - filtered_values = [values[field] if field < len(values) else "" for field in range(len(fields_to_include))] - - is_valid = False - if is_valid_status(filtered_values[3]): - is_valid = True - # Insert the data into the PostgreSQL table for each link - insert_query = """ - INSERT INTO linkchecker_output - (urlname, parentname, baseref, valid, result, warning, info, url, name) - VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s) - """ - cur.execute(insert_query, filtered_values) - conn.commit() - - insert_validation_history(conn, filtered_values[0], filtered_values[3], is_valid) + urls_to_recheck = set() + print("Initial Link Checking...") + for url in all_links: + for line in run_linkchecker(url): + if re.match(r'^http', line): + values = line.rstrip(';').split(';') + urlname = values[0] + + # Parse initial check results + filtered_values = [str(values[i]) if i < len(values) else "" for i in range(len(fields_to_include))] + urlname, parentname, baseref, valid, result, warning, info = filtered_values + + # Determine if URL needs to be rechecked + processed_url = determine_service_type(urlname) + if processed_url != urlname: + urls_to_recheck.add(processed_url) + else: + # If URL doesn't need reprocessing, insert results directly + is_valid = is_valid_status(valid) + link_id = insert_or_update_link(conn, urlname, valid, result, info, warning, is_valid) + insert_parent(conn, parentname, baseref, link_id) + print("Rechecking OGC processed URLs...") + for url in urls_to_recheck: + results = check_single_url(url) + for line in results: + if re.match(r'^http', line): + values = line.rstrip(';').split(';') + filtered_values = [str(values[i]) if i < len(values) else "" for i in range(len(fields_to_include))] + urlname, parentname, baseref, valid, result, warning, info = filtered_values + is_valid = is_valid_status(valid) + link_id = insert_or_update_link(conn, urlname, valid, result, info, warning, is_valid) + insert_parent(conn, parentname, baseref, link_id) + + # conn.commit() print("LinkChecker output written to PostgreSQL database") # Close the connection and cursor @@ -274,9 +341,8 @@ def main(): conn.close() end_time = time.time() - elapsed_time = end_time - start_time + elapsed_time 
= end_time - start_time print(f"Time elapsed: {elapsed_time:.2f} seconds") if __name__ == "__main__": main() -
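As a closing illustration of the new OGC handling in `src/linkchecker.py`, the sketch below restates, in simplified form, what `determine_service_type` does to a bare OGC endpoint before it is rechecked. It is a paraphrase for clarity (only the WMS and WFS branches are shown), not the code under review, and the example URL is hypothetical.

```python
# Simplified restatement of the OGC URL handling added in src/linkchecker.py:
# bare OGC service endpoints are rewritten into GetCapabilities requests so they
# no longer answer 400/500 when probed without parameters.
from urllib.parse import urlparse, parse_qs, urlencode

def to_get_capabilities(url: str) -> str:
    parsed = urlparse(url)
    params = parse_qs(parsed.query)
    params.pop("service", None)
    params.pop("request", None)
    params["request"] = ["GetCapabilities"]
    if "/wms" in parsed.path.lower():
        params["service"] = ["WMS"]
    elif "/wfs" in parsed.path.lower():
        params["service"] = ["WFS"]
    return parsed._replace(query=urlencode(params, doseq=True)).geturl()

print(to_get_capabilities("https://example.org/geoserver/wms"))
# -> https://example.org/geoserver/wms?request=GetCapabilities&service=WMS
```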