From 04b0d20c784fa301756e46ca22d731eb1771a7a3 Mon Sep 17 00:00:00 2001
From: Matthew Farrellee
Date: Mon, 16 Dec 2024 16:26:26 -0500
Subject: [PATCH 1/2] add support for nvidia/llama-3.2-nv-embedqa-1b-v2 and
 nvidia/llama-3.2-nv-rerankqa-1b-v2

---
 .../langchain_nvidia_ai_endpoints/_statics.py | 11 +++++++++++
 .../tests/integration_tests/test_ranking.py   |  2 +-
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/libs/ai-endpoints/langchain_nvidia_ai_endpoints/_statics.py b/libs/ai-endpoints/langchain_nvidia_ai_endpoints/_statics.py
index 916b6d2e..cf111e49 100644
--- a/libs/ai-endpoints/langchain_nvidia_ai_endpoints/_statics.py
+++ b/libs/ai-endpoints/langchain_nvidia_ai_endpoints/_statics.py
@@ -624,6 +624,11 @@ def validate_client(self) -> "Model":
         model_type="embedding",
         client="NVIDIAEmbeddings",
     ),
+    "nvidia/llama-3.2-nv-embedqa-1b-v2": Model(
+        id="nvidia/llama-3.2-nv-embedqa-1b-v2",
+        model_type="embedding",
+        client="NVIDIAEmbeddings",
+    ),
 }
 
 RANKING_MODEL_TABLE = {
@@ -646,6 +651,12 @@ def validate_client(self) -> "Model":
         client="NVIDIARerank",
         endpoint="https://ai.api.nvidia.com/v1/retrieval/nvidia/llama-3_2-nv-rerankqa-1b-v1/reranking",
     ),
+    "nvidia/llama-3.2-nv-rerankqa-1b-v2": Model(
+        id="nvidia/llama-3.2-nv-rerankqa-1b-v2",
+        model_type="ranking",
+        client="NVIDIARerank",
+        endpoint="https://ai.api.nvidia.com/v1/retrieval/nvidia/llama-3_2-nv-rerankqa-1b-v2/reranking",
+    ),
 }
 
 COMPLETION_MODEL_TABLE = {
diff --git a/libs/ai-endpoints/tests/integration_tests/test_ranking.py b/libs/ai-endpoints/tests/integration_tests/test_ranking.py
index 47fc8438..d067a1e6 100644
--- a/libs/ai-endpoints/tests/integration_tests/test_ranking.py
+++ b/libs/ai-endpoints/tests/integration_tests/test_ranking.py
@@ -202,7 +202,7 @@ def test_truncate_negative(rerank_model: str, mode: dict, truncate: str) -> None
     query = "What is acceleration?"
     documents = [
         Document(page_content="NVIDIA " * length)
-        for length in [32, 1024, 64, 128, 2048, 256, 512]
+        for length in [32, 1024, 64, 128, 10240, 256, 512]
     ]
     truncate_param = {}
     if truncate:

From 7fd810927912d086b539401d5fd1dea4b8b18d37 Mon Sep 17 00:00:00 2001
From: Matthew Farrellee
Date: Mon, 16 Dec 2024 16:37:02 -0500
Subject: [PATCH 2/2] fix lint

---
 .../ai-endpoints/tests/integration_tests/test_chat_models.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/libs/ai-endpoints/tests/integration_tests/test_chat_models.py b/libs/ai-endpoints/tests/integration_tests/test_chat_models.py
index ac657085..d81c9e19 100644
--- a/libs/ai-endpoints/tests/integration_tests/test_chat_models.py
+++ b/libs/ai-endpoints/tests/integration_tests/test_chat_models.py
@@ -236,6 +236,7 @@ def test_ai_endpoints_invoke_max_tokens_negative_a(
     with pytest.raises(Exception):
         llm = ChatNVIDIA(model=chat_model, max_tokens=max_tokens, **mode)
         llm.invoke("Show me the tokens")
+    assert llm._client.last_response is not None
     assert llm._client.last_response.status_code in [400, 422]
     assert "max_tokens" in str(llm._client.last_response.content)
 
@@ -250,6 +251,7 @@ def test_ai_endpoints_invoke_max_tokens_negative_b(
     with pytest.raises(Exception):
         llm = ChatNVIDIA(model=chat_model, max_tokens=max_tokens, **mode)
         llm.invoke("Show me the tokens")
+    assert llm._client.last_response is not None
     assert llm._client.last_response.status_code in [400, 422]
     # custom error string -
     # model inference failed -- ValueError: A requested length of the model output
@@ -306,6 +308,7 @@ def test_ai_endpoints_invoke_seed_default(chat_model: str, mode: dict) -> None:
 def test_ai_endpoints_invoke_seed_range(chat_model: str, mode: dict, seed: int) -> None:
     llm = ChatNVIDIA(model=chat_model, seed=seed, **mode)
     llm.invoke("What's in a seed?")
+    assert llm._client.last_response is not None
     assert llm._client.last_response.status_code == 200
 
 
@@ -332,6 +335,7 @@ def test_ai_endpoints_invoke_temperature_negative(
     with pytest.raises(Exception):
         llm = ChatNVIDIA(model=chat_model, temperature=temperature, **mode)
         llm.invoke("What's in a temperature?")
+    assert llm._client.last_response is not None
     assert llm._client.last_response.status_code in [400, 422]
     assert "temperature" in str(llm._client.last_response.content)
 
@@ -360,6 +364,7 @@ def test_ai_endpoints_invoke_top_p_negative(
     with pytest.raises(Exception):
         llm = ChatNVIDIA(model=chat_model, top_p=top_p, **mode)
         llm.invoke("What's in a top_p?")
+    assert llm._client.last_response is not None
     assert llm._client.last_response.status_code in [400, 422]
     assert "top_p" in str(llm._client.last_response.content)
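
Reviewer note (not part of the patch series): a minimal sketch for exercising the two new v2 entries locally, assuming langchain-nvidia-ai-endpoints is installed with these changes applied and NVIDIA_API_KEY is set in the environment. The model names come straight from the tables above; everything else (the sample texts and the printed fields) is illustrative only.

# Sketch: smoke-test the newly registered v2 embedding and reranking models.
# Assumes NVIDIA_API_KEY is exported; calls go to the hosted API endpoints.
from langchain_core.documents import Document
from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings, NVIDIARerank

# Embedding model registered in EMBEDDING_MODEL_TABLE by this patch.
embedder = NVIDIAEmbeddings(model="nvidia/llama-3.2-nv-embedqa-1b-v2")
vectors = embedder.embed_documents(["Acceleration is the rate of change of velocity."])
print(len(vectors), len(vectors[0]))  # number of vectors, embedding dimensionality

# Reranking model registered in RANKING_MODEL_TABLE by this patch.
reranker = NVIDIARerank(model="nvidia/llama-3.2-nv-rerankqa-1b-v2")
ranked = reranker.compress_documents(
    documents=[
        Document(page_content="NVIDIA designs GPUs."),
        Document(page_content="Acceleration is the rate of change of velocity."),
    ],
    query="What is acceleration?",
)
print(ranked[0].page_content)  # highest-scoring document first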