Skip to content
This repository has been archived by the owner on Mar 1, 2024. It is now read-only.

Remove links from telegram text messages #943

Merged
merged 1 commit into from
Feb 12, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 12 additions & 1 deletion llama_hub/telegram/base.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Telegram reader that reads posts/chats and comments on posts from a Telegram channel or chat."""
import asyncio
import re
from typing import List, Union

from llama_index.readers.base import BaseReader
Expand Down Expand Up @@ -102,5 +103,15 @@ async def _load_data(
entity_name, reply_to=post_id, limit=limit
):
if isinstance(message.text, str) and message.text != "":
results.append(Document(text=message.text))
results.append(Document(text=self._remove_links(message.text)))
return results

def _remove_links(self, string) -> str:
"""Removes all URLs from a given string, leaving only the base domain name."""

def replace_match(match):
text = match.group(1)
return text if text else ""

url_pattern = r"https?://(?:www\.)?((?!www\.).)+?"
return re.sub(url_pattern, replace_match, string)
Loading