From 2a46736f62bcd64aed3fa9b91585081d173bb3eb Mon Sep 17 00:00:00 2001 From: Deep Mistry <67551614+dmistry1@users.noreply.github.com> Date: Fri, 18 Oct 2024 11:02:34 -0400 Subject: [PATCH] CMR-10118: Update default paging functionality for CMR-STAC (#363) * CMR-10118: Updates default paging * CMR-10118: Fixes lint * CMR-10118: Address PR comments * CMR-10118: Fixes lint issues * CMR-10118: Addressing PR comments * CMR-10118: Updates github action checkout@v3 and setup-node@v3 to v4 --- .github/workflows/lint.yaml | 8 ++-- .github/workflows/run_tests.yaml | 4 +- serverless.dev.yml | 1 + serverless.yml | 1 + src/__tests__/providerCatalog.spec.ts | 56 ++++++++++++++++++++++++++- src/domains/collections.ts | 4 +- src/domains/stac.ts | 21 ++-------- src/routes/catalog.ts | 47 +++++++++++++++++----- 8 files changed, 107 insertions(+), 35 deletions(-) diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml index 65b77da4..578f03c2 100644 --- a/.github/workflows/lint.yaml +++ b/.github/workflows/lint.yaml @@ -12,9 +12,9 @@ jobs: node_version: [18] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Setup Node - uses: actions/setup-node@v3 + uses: actions/setup-node@v4 with: node-version: ${{ matrix.node_version }} @@ -31,9 +31,9 @@ jobs: node_version: [18] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Setup Node - uses: actions/setup-node@v3 + uses: actions/setup-node@v4 with: node-version: ${{ matrix.node_version }} diff --git a/.github/workflows/run_tests.yaml b/.github/workflows/run_tests.yaml index 62ee2607..450a6972 100644 --- a/.github/workflows/run_tests.yaml +++ b/.github/workflows/run_tests.yaml @@ -18,9 +18,9 @@ jobs: node_version: [18] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Setup Node - uses: actions/setup-node@v3 + uses: actions/setup-node@v4 with: node-version: ${{ matrix.node_version }} diff --git a/serverless.dev.yml b/serverless.dev.yml index 7dcb13ae..fc7bbc2c 100644 --- a/serverless.dev.yml +++ b/serverless.dev.yml @@ -9,6 +9,7 @@ provider: CMR_LB_URL: "https://cmr.sit.earthdata.nasa.gov" GRAPHQL_URL: "https://graphql.sit.earthdata.nasa.gov/api" STAC_VERSION: "1.0.0" + PAGE_SIZE: "100" functions: stac: diff --git a/serverless.yml b/serverless.yml index 6d1a293c..9b0f35ee 100644 --- a/serverless.yml +++ b/serverless.yml @@ -15,6 +15,7 @@ provider: CMR_LB_URL: ${param:cmr-lb-url, "http://${cf:${opt:stage}.servicesDnsName}"} GRAPHQL_URL: ${param:graphql-url, "https://graphql${self:custom.stagePrefix.${opt:stage}}.earthdata.nasa.gov/api"} STAC_VERSION: "1.0.0" + PAGE_SIZE: "100" LOG_LEVEL: INFO custom: diff --git a/src/__tests__/providerCatalog.spec.ts b/src/__tests__/providerCatalog.spec.ts index 6c8666a8..bd1ecf2f 100644 --- a/src/__tests__/providerCatalog.spec.ts +++ b/src/__tests__/providerCatalog.spec.ts @@ -13,8 +13,9 @@ import ItemSpec from "../../resources/item-spec/json-schema/item.json"; import { Link } from "../@types/StacCatalog"; import { createApp } from "../app"; -import * as Provider from "../domains/providers"; import * as Collections from "../domains/collections"; +import * as Provider from "../domains/providers"; +import * as stac from "../domains/stac"; import { generateSTACCollections } from "../utils/testUtils"; const stacApp = createApp(); @@ -165,6 +166,59 @@ describe("GET /:provider", () => { }); }); + describe("when there are more results available", () => { + it("includes a 'next' link with the correct query parameters", async () => { + sandbox.stub(stac, "CMR_QUERY_MAX").value(100); + sandbox + .stub(Provider, "getProviders") + .resolves([null, [{ "provider-id": "TEST", "short-name": "TEST" }]]); + const mockCollections = generateSTACCollections(100); + sandbox.stub(Collections, "getCollectionIds").resolves({ + count: 100, + cursor: "nextPageCursor", + items: mockCollections.map((coll) => ({ + id: `${coll.id}`, + title: coll.title ?? faker.random.words(4), + })), + }); + + const { body: catalog } = await request(stacApp).get(`/stac/TEST`); + + const nextLink = catalog.links.find((l: Link) => l.rel === "next"); + expect(nextLink).to.exist; + expect(nextLink.rel).to.equal("next"); + expect(nextLink.type).to.equal("application/json"); + expect(nextLink.title).to.equal("Next page of results"); + + const nextUrl = new URL(nextLink.href); + expect(nextUrl.pathname).to.equal("/stac/TEST"); + }); + }); + + describe("when there are no more results available", () => { + it("does not include a 'next' link", async () => { + sandbox.stub(stac, "CMR_QUERY_MAX").value(100); + sandbox + .stub(Provider, "getProviders") + .resolves([null, [{ "provider-id": "TEST", "short-name": "TEST" }]]); + + const mockCollections = generateSTACCollections(10); + sandbox.stub(Collections, "getCollectionIds").resolves({ + count: 10, + cursor: null, + items: mockCollections.map((coll) => ({ + id: `${coll.id}`, + title: coll.title ?? faker.random.words(4), + })), + }); + + const { body: catalog } = await request(stacApp).get("/stac/TEST"); + + const nextLink = catalog.links.find((l: Link) => l.rel === "next"); + expect(nextLink).to.not.exist; + }); + }); + describe(`given the provider has a collection`, () => { it("has a child link for that collection without query parameters", async function () { sandbox diff --git a/src/domains/collections.ts b/src/domains/collections.ts index ffc67d87..3d7f2f8f 100644 --- a/src/domains/collections.ts +++ b/src/domains/collections.ts @@ -16,7 +16,7 @@ import { import { cmrSpatialToExtent } from "./bounding-box"; -import { extractAssets, MAX_SIGNED_INTEGER, paginateQuery } from "./stac"; +import { CMR_QUERY_MAX, extractAssets, paginateQuery } from "./stac"; const CMR_ROOT = process.env.CMR_URL; const STAC_VERSION = process.env.STAC_VERSION ?? "1.0.0"; @@ -356,7 +356,7 @@ export const getAllCollectionIds = async ( cursor: string | null; items: { id: string; title: string }[]; }> => { - params.limit = MAX_SIGNED_INTEGER; + params.limit = CMR_QUERY_MAX; return await getCollectionIds(params, opts); }; diff --git a/src/domains/stac.ts b/src/domains/stac.ts index 860f7611..e2358cb1 100644 --- a/src/domains/stac.ts +++ b/src/domains/stac.ts @@ -205,8 +205,8 @@ export const extractAssets = ( {} as AssetLinks ); const GRAPHQL_URL = process.env.GRAPHQL_URL ?? "http://localhost:3013"; -export const CMR_QUERY_MAX = 2000; -export const MAX_SIGNED_INTEGER = 2 ** 31 - 1; + +export const CMR_QUERY_MAX = Number(process.env.PAGE_SIZE); const pointToQuery = (point: GeoJSONPoint) => point.coordinates.join(","); @@ -553,8 +553,7 @@ export const paginateQuery = async ( opts: { headers?: IncomingHttpHeaders; }, - handler: GraphQLHandler, - prevResults: unknown[] = [] + handler: GraphQLHandler ): Promise => { const paginatedParams = { ...params }; @@ -589,19 +588,7 @@ export const paginateQuery = async ( if (!data) throw new Error("No data returned from GraphQL during paginated query"); const { count, cursor, items } = data; - const totalResults = [...prevResults, ...items]; - const moreResultsAvailable = totalResults.length !== count && cursor != null; - const foundEnough = totalResults.length >= (params.limit ?? -1); - - if (moreResultsAvailable && !foundEnough) { - console.debug( - `Retrieved ${totalResults.length} of ${params.limit} for ${JSON.stringify(params, null, 2)}` - ); - const nextParams = mergeMaybe({ ...params }, { cursor }); - return await paginateQuery(gqlQuery, nextParams, opts, handler, totalResults); - } - - return { items: totalResults, count, cursor }; + return { items: items, count, cursor }; } catch (err: unknown) { if ( !(err instanceof Error) && diff --git a/src/routes/catalog.ts b/src/routes/catalog.ts index 7f516948..d378abc3 100644 --- a/src/routes/catalog.ts +++ b/src/routes/catalog.ts @@ -6,13 +6,14 @@ import { getAllCollectionIds } from "../domains/collections"; import { conformance } from "../domains/providers"; import { ServiceUnavailableError } from "../models/errors"; import { getBaseUrl, mergeMaybe, stacContext } from "../utils"; +import { CMR_QUERY_MAX, stringifyQuery } from "../domains/stac"; const STAC_VERSION = process.env.STAC_VERSION ?? "1.0.0"; -const generateSelfLinks = (req: Request): Links => { +const generateSelfLinks = (req: Request, nextCursor?: string | null, count?: number): Links => { const { stacRoot, path, self } = stacContext(req); - return [ + const links = [ { rel: "self", href: self, @@ -72,20 +73,46 @@ const generateSelfLinks = (req: Request): Links => { title: "HTML documentation", }, ]; + + const originalQuery = mergeMaybe(req.query, req.body); + + // Add a 'next' link if there are more results available + // This is determined by: + // 1. The presence of a nextCursor (indicating more results) + // 2. The number of collection equaling CMR_QUERY_MAX (100) + // The 'next' link includes the original query parameters plus the new cursor + if (nextCursor && count === CMR_QUERY_MAX) { + const nextResultsQuery = { ...originalQuery, cursor: nextCursor }; + + links.push({ + rel: "next", + href: `${stacRoot}${req.path}?${stringifyQuery(nextResultsQuery)}`, + type: "application/json", + title: "Next page of results", + }); + } + + return links; }; const providerCollections = async ( req: Request -): Promise<[null, { id: string; title: string }[]] | [string, null]> => { - const { headers, provider } = req; +): Promise<[null, { id: string; title: string }[], string | null] | [string, null]> => { + const { headers, provider, query } = req; const cloudOnly = headers["cloud-stac"] === "true" ? { cloudHosted: true } : {}; - const query = mergeMaybe({ provider: provider?.["provider-id"] }, { ...cloudOnly }); + const mergedQuery = mergeMaybe( + { + provider: provider?.["provider-id"], + cursor: query?.cursor, + }, + { ...cloudOnly } + ); try { - const { items } = await getAllCollectionIds(query, { headers }); - return [null, items]; + const { items, cursor } = await getAllCollectionIds(mergedQuery, { headers }); + return [null, items, cursor]; } catch (err) { console.error("A problem occurred querying for collections.", err); return [(err as Error).message, null]; @@ -94,15 +121,17 @@ const providerCollections = async ( export const providerCatalogHandler = async (req: Request, res: Response) => { const { provider } = req; + if (!provider) throw new ServiceUnavailableError("Could not retrieve provider information"); - const [err, collections] = await providerCollections(req); + const [err, collections, cursor] = await providerCollections(req); if (err) throw new ServiceUnavailableError(err as string); const { self } = stacContext(req); - const selfLinks = generateSelfLinks(req); + const selfLinks = generateSelfLinks(req, cursor, collections?.length); + const childLinks = (collections ?? []).map(({ id, title }) => ({ rel: "child", href: `${getBaseUrl(self)}/collections/${encodeURIComponent(id)}`,