diff --git a/apps/webcrawler/webcrawler.py b/apps/webcrawler/webcrawler.py
index 953cfc2..8e74623 100644
--- a/apps/webcrawler/webcrawler.py
+++ b/apps/webcrawler/webcrawler.py
@@ -24,7 +24,7 @@ def load_web():
     logger.info("load_web")
 
-    print(f"Loading of webpages started")
+    print("Loading of webpages started")
 
     # Initialize urls to parse
     urls = [
         "https://www.startmunich.de",
@@ -46,14 +46,14 @@ def load_web():
     # Process raw HTML docs to text docs
     html2text = Html2TextTransformer()
     docs = html2text.transform_documents(docs)
-    print(f"Loading of webpages finished")
+    print("Loading of webpages finished")
     logger.info("load_web finished")
     return docs
 
 
 def write_db():
     logger.info(f"start writing web_data")
-    print(f"Start writing web_data to neo4j")
+    print("Start writing web_data to neo4j")
     docs = load_web()
 
     # write each page into the neo4J database using the url as the id
@@ -78,7 +78,7 @@ def write_db():
             content=page.page_content
         )
         logger.info(f"page: " + page.metadata.get("source") + " added to neo4j")
-        print(f"page: " + page.metadata.get("source") + " added to neo4j")
+        print("page: " + page.metadata.get("source") + " added to neo4j")
 
     # send post request to '/enqueue_web' endpoint with web_ids
     # to add the webpage to the queue
@@ -86,7 +86,7 @@ def write_db():
     requests.post("http://vectordb_sync:5000/enqueue_web", json={"ids": web_ids})
 
     logger.info(f"adding web_ids: to queue")
-    print(f"adding web_ids: to queue")
+    print("adding web_ids: to queue")
 
 
 if __name__ == '__main__':