From 6293ee33c6733ebf8ee6d89c435976941d7b2922 Mon Sep 17 00:00:00 2001
From: Dias Kalkamanov
Date: Mon, 12 Feb 2024 12:22:20 +0600
Subject: [PATCH] remove links from telegram text messages

---
Note: line breaks and indentation in this patch were reconstructed from a
flattened copy. Verify the whitespace of the context lines against the
target file before applying.

 llama_hub/telegram/base.py | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/llama_hub/telegram/base.py b/llama_hub/telegram/base.py
index 314850d2c4..0202601057 100644
--- a/llama_hub/telegram/base.py
+++ b/llama_hub/telegram/base.py
@@ -1,5 +1,6 @@
 """Telegram reader that reads posts/chats and comments to post from Telegram channel or chat."""
 import asyncio
+import re
 from typing import List, Union
 
 from llama_index.readers.base import BaseReader
@@ -102,5 +103,19 @@ async def _load_data(
                 entity_name, reply_to=post_id, limit=limit
             ):
                 if isinstance(message.text, str) and message.text != "":
-                    results.append(Document(text=message.text))
+                    results.append(Document(text=self._remove_links(message.text)))
         return results
+
+    def _remove_links(self, string) -> str:
+        """Replace every http(s) URL in ``string`` with its host name.
+
+        The scheme, a leading ``www.`` and any port, path, query or fragment
+        are dropped, so ``https://www.example.com/a?b=1`` becomes
+        ``example.com``. Text that contains no URL is returned unchanged.
+        """
+        # Group 1 captures the host. A pattern ending in a lazy ``+?`` would
+        # consume only one character after the scheme and leave the path and
+        # query behind. Closing brackets are excluded so a markdown link such
+        # as ``[label](https://example.com/x)`` keeps its delimiters.
+        url_pattern = r"https?://(?:www\.)?([^\s/?#:)\]>]+)[^\s)\]>]*"
+        return re.sub(url_pattern, r"\1", string)