Merge pull request #127 from fractalego/indexing-with-large-corpus
Indexing with large corpus
fractalego authored Jul 24, 2024
2 parents 623ebed + e245faf commit 09df1e3
Showing 6 changed files with 14 additions and 11 deletions.
requirements.txt (1 change: 0 additions & 1 deletion)
@@ -1,7 +1,6 @@
 flask[async]==3.0.3
 flask-cors==4.0.1
 nltk==3.8.1
-gensim==4.3.3
 sklearn==0.0
 python-Levenshtein==0.25.1
 fuzzywuzzy==0.18.0
setup.py (1 change: 0 additions & 1 deletion)
@@ -49,7 +49,6 @@
         "flask[async]==3.0.3",
         "flask-cors==4.0.1",
         "nltk==3.8.1",
-        "gensim==4.3.3",
         "sklearn==0.0",
         "python-Levenshtein==0.25.1",
         "fuzzywuzzy==0.18.0",
todo.txt (1 change: 1 addition & 0 deletions)
@@ -1,4 +1,5 @@
 * why do I need to re-initialise the retrievers after unpickling the knowledge?
+    - maybe you should save the retrievers in the knowledge object separately?
 
 /* knowledge cache does not cache the rules or facts
 
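One reading of the first to-do item above: the retrievers hold a connector to the embedding service, which does not survive pickling, hence the re-initialisation after unpickling. A minimal sketch of the suggested alternative, keeping the retrievers' state inside the pickled knowledge object while rebuilding only the connector on load; make_connector is a hypothetical stand-in, not part of wafl:

class Retriever:
    def __init__(self, make_connector):
        self._make_connector = make_connector
        self._connector = make_connector()  # e.g. a network client; not picklable

    def __getstate__(self):
        # Drop the live connector before pickling; keep everything else.
        state = self.__dict__.copy()
        state["_connector"] = None
        return state

    def __setstate__(self, state):
        # Restore the saved state, then rebuild the connector on unpickling.
        self.__dict__.update(state)
        self._connector = self._make_connector()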
wafl/knowledge/indexing_implementation.py (3 changes: 1 addition & 2 deletions)
@@ -45,13 +45,12 @@ async def load_knowledge(config, logger=None):
         if knowledge.hash == hash(rules_txt) and os.path.getmtime(
             cache_filename
         ) > os.path.getmtime(index_filename):
-            await knowledge.initialize_retrievers()
             return knowledge
 
     knowledge = SingleFileKnowledge(config, rules_txt, logger=logger)
     knowledge = await _add_indices_to_knowledge(knowledge, index_txt)
-    joblib.dump(knowledge, config.get_value("cache_filename"))
     await knowledge.initialize_retrievers()
+    joblib.dump(knowledge, config.get_value("cache_filename"))
     return knowledge
 
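The effect of this change is that joblib.dump now runs after initialize_retrievers, so the cache is written in fully initialised form and the cache-hit branch can return the unpickled object directly. A minimal sketch of the resulting load-or-rebuild pattern; build_knowledge and initialise are hypothetical stand-ins for the real functions:

import os

import joblib


async def load_or_rebuild(cache_filename, index_filename, build_knowledge, initialise):
    # Reuse the cache only if it is newer than the index it was built from.
    if os.path.exists(cache_filename) and os.path.getmtime(
        cache_filename
    ) > os.path.getmtime(index_filename):
        return joblib.load(cache_filename)

    knowledge = build_knowledge()
    await initialise(knowledge)
    # Dump only after initialisation, so the pickle is ready to use on load.
    joblib.dump(knowledge, cache_filename)
    return knowledge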
wafl/retriever/dense_retriever.py (17 changes: 11 additions & 6 deletions)
@@ -2,7 +2,6 @@
 import numpy as np
 
 from typing import List, Tuple
-from gensim.models import KeyedVectors
 from wafl.connectors.factories.sentence_embedder_connector_factory import (
     SentenceEmbedderConnectorFactory,
 )
@@ -18,18 +17,24 @@ def __init__(self, model_name, config):
         self._connector = SentenceEmbedderConnectorFactory.get_connector(
             model_name, config
         )
-        self._embeddings_model = KeyedVectors(384)
+        self._matrix = np.zeros((0, 384))
+        self._indices = []
 
     async def add_text_and_index(self, text: str, index: str):
         embeddings = await self._get_embeddings_from_text(text)
-        self._embeddings_model.add_vectors([index], [embeddings])
-        self._embeddings_model.fill_norms(force=True)
+        self._matrix = np.vstack([self._matrix, embeddings])
+        self._indices.append(index)
 
     async def get_indices_and_scores_from_text(
-        self, text: str
+        self, text: str, topn: int = 5
     ) -> List[Tuple[str, float]]:
         embeddings = await self._get_embeddings_from_text(text)
-        return self._embeddings_model.similar_by_vector(embeddings, topn=5)
+        scores = np.dot(self._matrix, embeddings) / (
+            np.linalg.norm(self._matrix, axis=1) * np.linalg.norm(embeddings)
+        )
+        indices_and_scores = list(zip(self._indices, scores))
+        indices_and_scores.sort(key=lambda x: x[1], reverse=True)
+        return indices_and_scores[:topn]
 
     async def _get_embeddings_from_text(self, text: str) -> "numpy.array":
         return (await self._connector.predict(text))["embedding"]
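With gensim gone, the retriever stores embeddings in a plain (n, 384) numpy matrix and ranks by cosine similarity. A self-contained sketch of the same scoring logic outside the class, with a toy embedding dimension; the names here are illustrative, not the module's API:

from typing import List, Tuple

import numpy as np


def top_n_by_cosine(
    matrix: np.ndarray, indices: List[str], query: np.ndarray, topn: int = 5
) -> List[Tuple[str, float]]:
    # Cosine similarity of the query against every stored row.
    scores = np.dot(matrix, query) / (
        np.linalg.norm(matrix, axis=1) * np.linalg.norm(query)
    )
    ranked = sorted(zip(indices, scores), key=lambda pair: pair[1], reverse=True)
    return ranked[:topn]


# Example: two stored vectors; the query is closest to the first one.
stored = np.array([[1.0, 0.0], [0.0, 1.0]])
print(top_n_by_cosine(stored, ["a", "b"], np.array([0.9, 0.1]), topn=1))

One trade-off worth noting: np.vstack copies the whole matrix on every add_text_and_index call, so bulk indexing is cheaper if the rows are collected in a list and stacked once at the end.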
wafl/variables.py (2 changes: 1 addition & 1 deletion)
@@ -1,6 +1,6 @@
 def get_variables():
     return {
-        "version": "0.1.1",
+        "version": "0.1.2",
     }


