From 67686c11858b5520696ea490f826fd2ff7a0fb69 Mon Sep 17 00:00:00 2001 From: Harry Vu Date: Thu, 9 Jan 2025 14:56:23 -0800 Subject: [PATCH 01/16] Ensure fields are lists before combining in DocDetails class --- paperqa/types.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/paperqa/types.py b/paperqa/types.py index 5f3f50ff..9d4107ce 100644 --- a/paperqa/types.py +++ b/paperqa/types.py @@ -686,6 +686,16 @@ def __add__(self, other: DocDetails | int) -> DocDetails: # noqa: PLR0912 merged_data[field] = {**self.other, **other.other} # handle the bibtex / sources as special fields for field_to_combine in ("bibtex_source", "client_source"): + # Ensure the fields are lists before combining + if self.other.get(field_to_combine) and not isinstance( + self.other[field_to_combine], list + ): + self.other[field_to_combine] = [self.other[field_to_combine]] + if other.other.get(field_to_combine) and not isinstance( + other.other[field_to_combine], list + ): + other.other[field_to_combine] = [other.other[field_to_combine]] + if self.other.get(field_to_combine) and other.other.get( field_to_combine ): From 1505d5d6301223dc376531daf06487852b60bd6a Mon Sep 17 00:00:00 2001 From: Harry Vu Date: Thu, 9 Jan 2025 15:02:53 -0800 Subject: [PATCH 02/16] Update .mailmap to include new author entries for Harry Vu --- .mailmap | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.mailmap b/.mailmap index 3c2ce6ea..49a287e0 100644 --- a/.mailmap +++ b/.mailmap @@ -8,3 +8,5 @@ Odhran O'Donoghue <39832722+odhran-o-d@users.nore Samantha Cox Anush008 Anush Mayk Caldas maykcaldas +Harry Vu +Harry Vu harryvu-futurehouse From 73d86a23c8fa1c77e3383ad131f68b1c2d461723 Mon Sep 17 00:00:00 2001 From: Harry Vu Date: Thu, 9 Jan 2025 16:03:15 -0800 Subject: [PATCH 03/16] Add test for merging DocDetails to ensure list format consistency --- tests/test_task.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/tests/test_task.py b/tests/test_task.py index af845c66..bf0d1962 100644 --- a/tests/test_task.py +++ b/tests/test_task.py @@ -1,6 +1,7 @@ import asyncio from collections.abc import Iterable from copy import deepcopy +from datetime import datetime from unittest.mock import patch import pytest @@ -22,6 +23,7 @@ ) from paperqa.agents.tools import GenerateAnswer from paperqa.litqa import DEFAULT_REWARD_MAPPING +from paperqa.types import DocDetails @pytest.fixture(name="base_query_request") @@ -276,3 +278,32 @@ async def test_tool_failure(self, base_query_request: QueryRequest) -> None: mock_SearchIndex.assert_called(), "Expected failures to come from unit test" assert metrics_callback.eval_means["correct"] == 0.0 assert metrics_callback.eval_means["correct_unsure"] == 0.0 + + +@pytest.mark.asyncio +async def test_docdetails_add(): + # Create two DocDetails instances + doc1 = DocDetails( + citation="Citation 1", + publication_date=datetime(2023, 1, 1), + docname="Document 1", + dockey="key1", + other={"bibtex_source": "source1", "client_source": ["client1"]}, + ) + + doc2 = DocDetails( + citation="Citation 2", + publication_date=datetime(2024, 1, 1), + docname="Document 2", + dockey="key2", + other={"bibtex_source": ["source2"], "client_source": "client2"}, + ) + + # Merge the two DocDetails instances + merged_doc = doc1 + doc2 + + # use set operations to check for containment + assert {"source1", "source2"}.issubset(merged_doc.other["bibtex_source"]) + assert {"client1", "client2"}.issubset(merged_doc.other["client_source"]) + # Check that the merged_doc is an instance of DocDetails + assert isinstance(merged_doc, DocDetails) From c9571243863be204f02964fe1f021344da7c66ca Mon Sep 17 00:00:00 2001 From: Harry Vu Date: Thu, 9 Jan 2025 16:09:53 -0800 Subject: [PATCH 04/16] add more test for `DocDetails` `__add__` --- tests/test_task.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/tests/test_task.py b/tests/test_task.py index bf0d1962..a6f893e2 100644 --- a/tests/test_task.py +++ b/tests/test_task.py @@ -307,3 +307,28 @@ async def test_docdetails_add(): assert {"client1", "client2"}.issubset(merged_doc.other["client_source"]) # Check that the merged_doc is an instance of DocDetails assert isinstance(merged_doc, DocDetails) + + +@pytest.mark.asyncio +def test_docdetails_merge_with_list_fields(): + doc1 = DocDetails( + citation="Citation 1", + publication_date=datetime(2023, 1, 1), + other={"bibtex_source": ["source1"], "client_source": ["client1"]}, + ) + + doc2 = DocDetails( + citation="Citation 2", + publication_date=datetime(2024, 1, 1), + other={"bibtex_source": ["source2"], "client_source": ["client2"]}, + ) + + # Merge the two DocDetails instances + merged_doc = doc1 + doc2 + + # Check that 'other' fields are merged correctly + assert {"source1", "source2"}.issubset(merged_doc.other["bibtex_source"]) + assert {"client1", "client2"}.issubset(merged_doc.other["client_source"]) + + # Check that the merged_doc is an instance of DocDetails + assert isinstance(merged_doc, DocDetails) From b3505a5f0003113264e7f7c7eebf62de391feb59 Mon Sep 17 00:00:00 2001 From: Harry Vu Date: Thu, 9 Jan 2025 16:10:49 -0800 Subject: [PATCH 05/16] add more test for `DocDetails` `__add__` --- tests/test_task.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/test_task.py b/tests/test_task.py index a6f893e2..b9f30db9 100644 --- a/tests/test_task.py +++ b/tests/test_task.py @@ -314,12 +314,16 @@ def test_docdetails_merge_with_list_fields(): doc1 = DocDetails( citation="Citation 1", publication_date=datetime(2023, 1, 1), + docname="Document 1", + dockey="key1", other={"bibtex_source": ["source1"], "client_source": ["client1"]}, ) doc2 = DocDetails( citation="Citation 2", publication_date=datetime(2024, 1, 1), + docname="Document 2", + dockey="key2", other={"bibtex_source": ["source2"], "client_source": ["client2"]}, ) From 8d078442d11481b98a808b73bd1f9b2bb565442a Mon Sep 17 00:00:00 2001 From: Harry Vu Date: Thu, 9 Jan 2025 16:16:40 -0800 Subject: [PATCH 06/16] rename test for better explanation --- tests/test_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_task.py b/tests/test_task.py index b9f30db9..24693754 100644 --- a/tests/test_task.py +++ b/tests/test_task.py @@ -281,7 +281,7 @@ async def test_tool_failure(self, base_query_request: QueryRequest) -> None: @pytest.mark.asyncio -async def test_docdetails_add(): +async def test_docdetails_merge_with_non_list_fields(): # Create two DocDetails instances doc1 = DocDetails( citation="Citation 1", From 0dfa22229e93cc59d4b55dcefcb2f3d2b7d21504 Mon Sep 17 00:00:00 2001 From: Harry Vu Date: Thu, 9 Jan 2025 17:42:33 -0800 Subject: [PATCH 07/16] move tests for docdetails merge to `test_paper.py` --- tests/test_paperqa.py | 58 +++++++++++++++++++++++++++++++++++++++++++ tests/test_task.py | 58 ------------------------------------------- 2 files changed, 58 insertions(+), 58 deletions(-) diff --git a/tests/test_paperqa.py b/tests/test_paperqa.py index fd5bbba4..fe26f836 100644 --- a/tests/test_paperqa.py +++ b/tests/test_paperqa.py @@ -8,6 +8,7 @@ from copy import deepcopy from io import BytesIO from pathlib import Path +from datetime import datetime from typing import cast import httpx @@ -1238,6 +1239,63 @@ def test_dois_resolve_to_correct_journals(doi_journals): details = DocDetails(doi=doi_journals["doi"]) # type: ignore[call-arg] assert details.journal == doi_journals["journal"] +@pytest.mark.asyncio +async def test_docdetails_merge_with_non_list_fields(): + # Create two DocDetails instances + doc1 = DocDetails( + citation="Citation 1", + publication_date=datetime(2023, 1, 1), + docname="Document 1", + dockey="key1", + other={"bibtex_source": "source1", "client_source": ["client1"]}, + ) + + doc2 = DocDetails( + citation="Citation 2", + publication_date=datetime(2024, 1, 1), + docname="Document 2", + dockey="key2", + other={"bibtex_source": ["source2"], "client_source": "client2"}, + ) + + # Merge the two DocDetails instances + merged_doc = doc1 + doc2 + + # use set operations to check for containment + assert {"source1", "source2"}.issubset(merged_doc.other["bibtex_source"]) + assert {"client1", "client2"}.issubset(merged_doc.other["client_source"]) + # Check that the merged_doc is an instance of DocDetails + assert isinstance(merged_doc, DocDetails) + + +@pytest.mark.asyncio +def test_docdetails_merge_with_list_fields(): + doc1 = DocDetails( + citation="Citation 1", + publication_date=datetime(2023, 1, 1), + docname="Document 1", + dockey="key1", + other={"bibtex_source": ["source1"], "client_source": ["client1"]}, + ) + + doc2 = DocDetails( + citation="Citation 2", + publication_date=datetime(2024, 1, 1), + docname="Document 2", + dockey="key2", + other={"bibtex_source": ["source2"], "client_source": ["client2"]}, + ) + + # Merge the two DocDetails instances + merged_doc = doc1 + doc2 + + # Check that 'other' fields are merged correctly + assert {"source1", "source2"}.issubset(merged_doc.other["bibtex_source"]) + assert {"client1", "client2"}.issubset(merged_doc.other["client_source"]) + + # Check that the merged_doc is an instance of DocDetails + assert isinstance(merged_doc, DocDetails) + @pytest.mark.vcr @pytest.mark.parametrize("use_partition", [True, False]) diff --git a/tests/test_task.py b/tests/test_task.py index fc1ffda9..d7795ce4 100644 --- a/tests/test_task.py +++ b/tests/test_task.py @@ -283,61 +283,3 @@ async def test_tool_failure(self, base_query_request: QueryRequest) -> None: mock_SearchIndex.assert_called(), "Expected failures to come from unit test" assert metrics_callback.eval_means["correct"] == 0.0 assert metrics_callback.eval_means["correct_unsure"] == 0.0 - - -@pytest.mark.asyncio -async def test_docdetails_merge_with_non_list_fields(): - # Create two DocDetails instances - doc1 = DocDetails( - citation="Citation 1", - publication_date=datetime(2023, 1, 1), - docname="Document 1", - dockey="key1", - other={"bibtex_source": "source1", "client_source": ["client1"]}, - ) - - doc2 = DocDetails( - citation="Citation 2", - publication_date=datetime(2024, 1, 1), - docname="Document 2", - dockey="key2", - other={"bibtex_source": ["source2"], "client_source": "client2"}, - ) - - # Merge the two DocDetails instances - merged_doc = doc1 + doc2 - - # use set operations to check for containment - assert {"source1", "source2"}.issubset(merged_doc.other["bibtex_source"]) - assert {"client1", "client2"}.issubset(merged_doc.other["client_source"]) - # Check that the merged_doc is an instance of DocDetails - assert isinstance(merged_doc, DocDetails) - - -@pytest.mark.asyncio -def test_docdetails_merge_with_list_fields(): - doc1 = DocDetails( - citation="Citation 1", - publication_date=datetime(2023, 1, 1), - docname="Document 1", - dockey="key1", - other={"bibtex_source": ["source1"], "client_source": ["client1"]}, - ) - - doc2 = DocDetails( - citation="Citation 2", - publication_date=datetime(2024, 1, 1), - docname="Document 2", - dockey="key2", - other={"bibtex_source": ["source2"], "client_source": ["client2"]}, - ) - - # Merge the two DocDetails instances - merged_doc = doc1 + doc2 - - # Check that 'other' fields are merged correctly - assert {"source1", "source2"}.issubset(merged_doc.other["bibtex_source"]) - assert {"client1", "client2"}.issubset(merged_doc.other["client_source"]) - - # Check that the merged_doc is an instance of DocDetails - assert isinstance(merged_doc, DocDetails) From b5f716698e00449883ef6405074a5198f7998d3a Mon Sep 17 00:00:00 2001 From: Harry Vu Date: Thu, 9 Jan 2025 17:54:21 -0800 Subject: [PATCH 08/16] move tests for merging DocDetails to `test_paperqa.py` --- tests/test_paperqa.py | 3 ++- tests/test_task.py | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_paperqa.py b/tests/test_paperqa.py index fe26f836..6e75057b 100644 --- a/tests/test_paperqa.py +++ b/tests/test_paperqa.py @@ -6,9 +6,9 @@ import textwrap from collections.abc import AsyncIterable, Sequence from copy import deepcopy +from datetime import datetime from io import BytesIO from pathlib import Path -from datetime import datetime from typing import cast import httpx @@ -1239,6 +1239,7 @@ def test_dois_resolve_to_correct_journals(doi_journals): details = DocDetails(doi=doi_journals["doi"]) # type: ignore[call-arg] assert details.journal == doi_journals["journal"] + @pytest.mark.asyncio async def test_docdetails_merge_with_non_list_fields(): # Create two DocDetails instances diff --git a/tests/test_task.py b/tests/test_task.py index d7795ce4..523fd946 100644 --- a/tests/test_task.py +++ b/tests/test_task.py @@ -23,7 +23,6 @@ ) from paperqa.agents.tools import GenerateAnswer from paperqa.litqa import DEFAULT_REWARD_MAPPING -from paperqa.types import DocDetails @pytest.fixture(name="base_query_request") From bda482c8ed62e74669e56e8a106c46fd9bb1ba25 Mon Sep 17 00:00:00 2001 From: Harry Vu Date: Thu, 9 Jan 2025 18:04:59 -0800 Subject: [PATCH 09/16] adding comments to explain test cases --- tests/test_paperqa.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tests/test_paperqa.py b/tests/test_paperqa.py index 6e75057b..38e27885 100644 --- a/tests/test_paperqa.py +++ b/tests/test_paperqa.py @@ -1242,6 +1242,8 @@ def test_dois_resolve_to_correct_journals(doi_journals): @pytest.mark.asyncio async def test_docdetails_merge_with_non_list_fields(): + # Test merging two DocDetails that were republished + # (so they have a new publication date but represent the same document) # Create two DocDetails instances doc1 = DocDetails( citation="Citation 1", @@ -1252,9 +1254,9 @@ async def test_docdetails_merge_with_non_list_fields(): ) doc2 = DocDetails( - citation="Citation 2", + citation="Citation 1", publication_date=datetime(2024, 1, 1), - docname="Document 2", + docname="Document 1", dockey="key2", other={"bibtex_source": ["source2"], "client_source": "client2"}, ) @@ -1271,6 +1273,8 @@ async def test_docdetails_merge_with_non_list_fields(): @pytest.mark.asyncio def test_docdetails_merge_with_list_fields(): + # Test merging two DocDetails that were republished + # (so they have a new publication date but represent the same document) doc1 = DocDetails( citation="Citation 1", publication_date=datetime(2023, 1, 1), @@ -1280,9 +1284,9 @@ def test_docdetails_merge_with_list_fields(): ) doc2 = DocDetails( - citation="Citation 2", + citation="Citation 1", publication_date=datetime(2024, 1, 1), - docname="Document 2", + docname="Document 1", dockey="key2", other={"bibtex_source": ["source2"], "client_source": ["client2"]}, ) From 983da666906ad44d5f4e06e91095c432136008d3 Mon Sep 17 00:00:00 2001 From: Harry Vu Date: Thu, 9 Jan 2025 18:09:16 -0800 Subject: [PATCH 10/16] clarifying comments --- tests/test_paperqa.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/test_paperqa.py b/tests/test_paperqa.py index 38e27885..9f13fa86 100644 --- a/tests/test_paperqa.py +++ b/tests/test_paperqa.py @@ -1242,8 +1242,8 @@ def test_dois_resolve_to_correct_journals(doi_journals): @pytest.mark.asyncio async def test_docdetails_merge_with_non_list_fields(): - # Test merging two DocDetails that were republished - # (so they have a new publication date but represent the same document) + # Test merging a DocDetail that was republished + # (so it has a new publication date but represents the same document) # Create two DocDetails instances doc1 = DocDetails( citation="Citation 1", @@ -1273,8 +1273,9 @@ async def test_docdetails_merge_with_non_list_fields(): @pytest.mark.asyncio def test_docdetails_merge_with_list_fields(): - # Test merging two DocDetails that were republished - # (so they have a new publication date but represent the same document) + # Test merging a DocDetail that was republished + # (so it has a new publication date but represents the same document) + # Create two DocDetails instances doc1 = DocDetails( citation="Citation 1", publication_date=datetime(2023, 1, 1), From d27da1afd7015a43f48d8b5eb77e9bd5c40b704b Mon Sep 17 00:00:00 2001 From: Harry Vu Date: Fri, 10 Jan 2025 08:54:34 -0800 Subject: [PATCH 11/16] make code more self-documenting --- tests/test_paperqa.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/tests/test_paperqa.py b/tests/test_paperqa.py index 9f13fa86..3aac50af 100644 --- a/tests/test_paperqa.py +++ b/tests/test_paperqa.py @@ -1243,8 +1243,6 @@ def test_dois_resolve_to_correct_journals(doi_journals): @pytest.mark.asyncio async def test_docdetails_merge_with_non_list_fields(): # Test merging a DocDetail that was republished - # (so it has a new publication date but represents the same document) - # Create two DocDetails instances doc1 = DocDetails( citation="Citation 1", publication_date=datetime(2023, 1, 1), @@ -1254,9 +1252,9 @@ async def test_docdetails_merge_with_non_list_fields(): ) doc2 = DocDetails( - citation="Citation 1", + citation=doc1.citation, publication_date=datetime(2024, 1, 1), - docname="Document 1", + docname=doc1.docname, dockey="key2", other={"bibtex_source": ["source2"], "client_source": "client2"}, ) @@ -1274,8 +1272,6 @@ async def test_docdetails_merge_with_non_list_fields(): @pytest.mark.asyncio def test_docdetails_merge_with_list_fields(): # Test merging a DocDetail that was republished - # (so it has a new publication date but represents the same document) - # Create two DocDetails instances doc1 = DocDetails( citation="Citation 1", publication_date=datetime(2023, 1, 1), @@ -1285,9 +1281,9 @@ def test_docdetails_merge_with_list_fields(): ) doc2 = DocDetails( - citation="Citation 1", + citation=doc1.citation, publication_date=datetime(2024, 1, 1), - docname="Document 1", + docname=doc1.docname, dockey="key2", other={"bibtex_source": ["source2"], "client_source": ["client2"]}, ) From a8ea231a6bc6d230dda9e1aaa4ba58630ff2c4c3 Mon Sep 17 00:00:00 2001 From: Harry Vu Date: Fri, 10 Jan 2025 09:57:58 -0800 Subject: [PATCH 12/16] make code more self-documented --- tests/test_paperqa.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/tests/test_paperqa.py b/tests/test_paperqa.py index 3aac50af..a6009f60 100644 --- a/tests/test_paperqa.py +++ b/tests/test_paperqa.py @@ -6,7 +6,7 @@ import textwrap from collections.abc import AsyncIterable, Sequence from copy import deepcopy -from datetime import datetime +from datetime import datetime, timedelta from io import BytesIO from pathlib import Path from typing import cast @@ -1243,17 +1243,19 @@ def test_dois_resolve_to_correct_journals(doi_journals): @pytest.mark.asyncio async def test_docdetails_merge_with_non_list_fields(): # Test merging a DocDetail that was republished + initial_date = datetime(2023, 1, 1) doc1 = DocDetails( citation="Citation 1", - publication_date=datetime(2023, 1, 1), + publication_date=initial_date, docname="Document 1", dockey="key1", other={"bibtex_source": "source1", "client_source": ["client1"]}, ) + later_publication_date = initial_date + timedelta(weeks=13) doc2 = DocDetails( citation=doc1.citation, - publication_date=datetime(2024, 1, 1), + publication_date=later_publication_date, docname=doc1.docname, dockey="key2", other={"bibtex_source": ["source2"], "client_source": "client2"}, @@ -1272,17 +1274,19 @@ async def test_docdetails_merge_with_non_list_fields(): @pytest.mark.asyncio def test_docdetails_merge_with_list_fields(): # Test merging a DocDetail that was republished + initial_date = datetime(2023, 1, 1) doc1 = DocDetails( citation="Citation 1", - publication_date=datetime(2023, 1, 1), + publication_date=initial_date, docname="Document 1", dockey="key1", other={"bibtex_source": ["source1"], "client_source": ["client1"]}, ) + later_publication_date = initial_date + timedelta(weeks=13) doc2 = DocDetails( citation=doc1.citation, - publication_date=datetime(2024, 1, 1), + publication_date=later_publication_date, docname=doc1.docname, dockey="key2", other={"bibtex_source": ["source2"], "client_source": ["client2"]}, From 69128f98a3279ce69e5a84842c83ebb53f3f5cf4 Mon Sep 17 00:00:00 2001 From: Harry Vu Date: Fri, 10 Jan 2025 13:38:07 -0800 Subject: [PATCH 13/16] making the tests more documented --- tests/test_paperqa.py | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/tests/test_paperqa.py b/tests/test_paperqa.py index a6009f60..3a383424 100644 --- a/tests/test_paperqa.py +++ b/tests/test_paperqa.py @@ -1240,8 +1240,7 @@ def test_dois_resolve_to_correct_journals(doi_journals): assert details.journal == doi_journals["journal"] -@pytest.mark.asyncio -async def test_docdetails_merge_with_non_list_fields(): +def test_docdetails_merge_with_non_list_fields(): # Test merging a DocDetail that was republished initial_date = datetime(2023, 1, 1) doc1 = DocDetails( @@ -1257,18 +1256,20 @@ async def test_docdetails_merge_with_non_list_fields(): citation=doc1.citation, publication_date=later_publication_date, docname=doc1.docname, - dockey="key2", + dockey=doc1.dockey, other={"bibtex_source": ["source2"], "client_source": "client2"}, ) # Merge the two DocDetails instances merged_doc = doc1 + doc2 - # use set operations to check for containment - assert {"source1", "source2"}.issubset(merged_doc.other["bibtex_source"]) - assert {"client1", "client2"}.issubset(merged_doc.other["client_source"]) - # Check that the merged_doc is an instance of DocDetails - assert isinstance(merged_doc, DocDetails) + assert {"source1", "source2"}.issubset( + merged_doc.other["bibtex_source"] + ), "Expected merge to keep both bibtex sources" + assert {"client1", "client2"}.issubset( + merged_doc.other["client_source"] + ), "Expected merge to keep both client sources" + assert isinstance(merged_doc, DocDetails), "Merged doc should also be DocDetails" @pytest.mark.asyncio @@ -1288,19 +1289,20 @@ def test_docdetails_merge_with_list_fields(): citation=doc1.citation, publication_date=later_publication_date, docname=doc1.docname, - dockey="key2", + dockey=doc1.dockey, other={"bibtex_source": ["source2"], "client_source": ["client2"]}, ) # Merge the two DocDetails instances merged_doc = doc1 + doc2 - # Check that 'other' fields are merged correctly - assert {"source1", "source2"}.issubset(merged_doc.other["bibtex_source"]) - assert {"client1", "client2"}.issubset(merged_doc.other["client_source"]) - - # Check that the merged_doc is an instance of DocDetails - assert isinstance(merged_doc, DocDetails) + assert {"source1", "source2"}.issubset( + merged_doc.other["bibtex_source"] + ), "Expected merge to keep both bibtex sources" + assert {"client1", "client2"}.issubset( + merged_doc.other["client_source"] + ), "Expected merge to keep both client sources" + assert isinstance(merged_doc, DocDetails), "Merged doc should also be DocDetails" @pytest.mark.vcr From 17d8c701ad8a983fc6e48d4bfb120d8449b28578 Mon Sep 17 00:00:00 2001 From: Harry Vu Date: Fri, 10 Jan 2025 13:48:48 -0800 Subject: [PATCH 14/16] remove unnecessary `@pytest.mark.asyncio` --- tests/test_paperqa.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_paperqa.py b/tests/test_paperqa.py index 3a383424..72cbe5d6 100644 --- a/tests/test_paperqa.py +++ b/tests/test_paperqa.py @@ -1272,7 +1272,6 @@ def test_docdetails_merge_with_non_list_fields(): assert isinstance(merged_doc, DocDetails), "Merged doc should also be DocDetails" -@pytest.mark.asyncio def test_docdetails_merge_with_list_fields(): # Test merging a DocDetail that was republished initial_date = datetime(2023, 1, 1) From f6009545328009d1677b736c48b3cbc3bf91b02b Mon Sep 17 00:00:00 2001 From: Harry Vu Date: Fri, 10 Jan 2025 14:05:55 -0800 Subject: [PATCH 15/16] document the tests --- tests/test_paperqa.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tests/test_paperqa.py b/tests/test_paperqa.py index 72cbe5d6..fbe295ea 100644 --- a/tests/test_paperqa.py +++ b/tests/test_paperqa.py @@ -1240,14 +1240,15 @@ def test_dois_resolve_to_correct_journals(doi_journals): assert details.journal == doi_journals["journal"] -def test_docdetails_merge_with_non_list_fields(): - # Test merging a DocDetail that was republished +def test_docdetails_merge_with_non_list_fields() -> None: + """Check republication where the source metadata is the same shape.""" initial_date = datetime(2023, 1, 1) doc1 = DocDetails( citation="Citation 1", publication_date=initial_date, docname="Document 1", dockey="key1", + # NOTE: doc1 has non-list bibtex_source and list client_source other={"bibtex_source": "source1", "client_source": ["client1"]}, ) @@ -1257,6 +1258,7 @@ def test_docdetails_merge_with_non_list_fields(): publication_date=later_publication_date, docname=doc1.docname, dockey=doc1.dockey, + # NOTE: doc2 has list bibtex_source and non-list client_source other={"bibtex_source": ["source2"], "client_source": "client2"}, ) @@ -1272,14 +1274,15 @@ def test_docdetails_merge_with_non_list_fields(): assert isinstance(merged_doc, DocDetails), "Merged doc should also be DocDetails" -def test_docdetails_merge_with_list_fields(): - # Test merging a DocDetail that was republished +def test_docdetails_merge_with_list_fields() -> None: + """Check republication where the source metadata is the same shape.""" initial_date = datetime(2023, 1, 1) doc1 = DocDetails( citation="Citation 1", publication_date=initial_date, docname="Document 1", dockey="key1", + # NOTE: doc1 has list bibtex_source and list client_source other={"bibtex_source": ["source1"], "client_source": ["client1"]}, ) @@ -1289,6 +1292,7 @@ def test_docdetails_merge_with_list_fields(): publication_date=later_publication_date, docname=doc1.docname, dockey=doc1.dockey, + # NOTE: doc2 has list bibtex_source and list client_source other={"bibtex_source": ["source2"], "client_source": ["client2"]}, ) From 7c4d21c33bfe51b6ef4e479c263e6e8072401dea Mon Sep 17 00:00:00 2001 From: Harry Vu Date: Fri, 10 Jan 2025 14:20:10 -0800 Subject: [PATCH 16/16] correct docstring --- tests/test_paperqa.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_paperqa.py b/tests/test_paperqa.py index fbe295ea..690d719a 100644 --- a/tests/test_paperqa.py +++ b/tests/test_paperqa.py @@ -1241,7 +1241,7 @@ def test_dois_resolve_to_correct_journals(doi_journals): def test_docdetails_merge_with_non_list_fields() -> None: - """Check republication where the source metadata is the same shape.""" + """Check republication where the source metadata has different shapes.""" initial_date = datetime(2023, 1, 1) doc1 = DocDetails( citation="Citation 1",