From 4f38697834f8e372182cbebe0162ea314ff845ef Mon Sep 17 00:00:00 2001 From: nishika26 Date: Fri, 26 Dec 2025 14:04:59 +0530 Subject: [PATCH 1/4] collection: include signed url of documents --- backend/app/api/docs/collections/info.md | 4 +++- backend/app/api/routes/collections.py | 29 ++++++++++++++++++------ backend/app/api/routes/documents.py | 4 ++-- 3 files changed, 27 insertions(+), 10 deletions(-) diff --git a/backend/app/api/docs/collections/info.md b/backend/app/api/docs/collections/info.md index 576046bd..69582812 100644 --- a/backend/app/api/docs/collections/info.md +++ b/backend/app/api/docs/collections/info.md @@ -1,4 +1,6 @@ Retrieve detailed information about a specific collection by its collection id. This endpoint returns the collection object including its project, organization, timestamps, and associated LLM service details (`llm_service_id` and `llm_service_name`). -Additionally, if the `include_docs` flag in the request body is true then you will get a list of document IDs associated with a given collection as well. Note that, documents returned are not only stored by Kaapi, but also by Vector store provider. +If the `include_docs` flag in the request body is true then you will get a list of document IDs associated with a given collection as well. Note that, documents returned are not only stored by Kaapi, but also by Vector store provider. + +Additionally, if you set the ``include_url`` parameter to true, a signed URL will be included in the response, which is a clickable link to access the retrieved document. If you don't set it to true, the URL will not be included in the response. diff --git a/backend/app/api/routes/collections.py b/backend/app/api/routes/collections.py index d19fad31..b5e37e2f 100644 --- a/backend/app/api/routes/collections.py +++ b/backend/app/api/routes/collections.py @@ -12,8 +12,8 @@ CollectionJobCrud, DocumentCollectionCrud, ) +from app.core.cloud import get_cloud_storage from app.models import ( - DocumentPublic, CollectionJobStatus, CollectionActionType, CollectionJobCreate, @@ -32,6 +32,7 @@ create_collection as create_service, delete_collection as delete_service, ) +from app.services.documents.helpers import build_document_schemas logger = logging.getLogger(__name__) @@ -184,8 +185,15 @@ def collection_info( True, description="If true, include documents linked to this collection", ), - skip: int = Query(0, ge=0), - limit: int = Query(100, gt=0, le=100), + include_url: bool = Query( + False, description="Include a signed URL to access the document" + ), + limit: int = Query( + None, + gt=0, + le=500, + description="Limit number of documents returned (default: all, max: 500)", + ), ): collection_crud = CollectionCrud(session, current_user.project_.id) collection = collection_crud.read_one(collection_id) @@ -194,9 +202,16 @@ def collection_info( if include_docs: document_collection_crud = DocumentCollectionCrud(session) - docs = document_collection_crud.read(collection, skip, limit) - collection_with_docs.documents = [ - DocumentPublic.model_validate(doc) for doc in docs - ] + documents = document_collection_crud.read(collection, limit) + + storage = None + if include_url and documents: + storage = get_cloud_storage( + session=session, project_id=current_user.project_.id + ) + + collection_with_docs.documents = build_document_schemas( + documents=documents, storage=storage, include_url=include_url + ) return APIResponse.success_response(collection_with_docs) diff --git a/backend/app/api/routes/documents.py b/backend/app/api/routes/documents.py index 16fdcfc6..58beb31b 100644 --- a/backend/app/api/routes/documents.py +++ b/backend/app/api/routes/documents.py @@ -11,12 +11,10 @@ Query, UploadFile, ) -from pydantic import HttpUrl from fastapi import Path as FastPath from app.api.deps import AuthContextDep, SessionDep from app.api.permissions import Permission, require_permission -from app.core.cloud import get_cloud_storage from app.crud import CollectionCrud, DocumentCrud from app.crud.rag import OpenAIAssistantCrud, OpenAIVectorStoreCrud from app.models import ( @@ -28,6 +26,7 @@ TransformationJobInfo, DocTransformationJobPublic, ) +from app.core.cloud import get_cloud_storage from app.services.collections.helpers import pick_service_for_documennt from app.services.documents.helpers import ( schedule_transformation, @@ -261,4 +260,5 @@ def doc_info( include_url=include_url, storage=storage, ) + return APIResponse.success_response(doc_schema) From aa4e1a139eadab3770099cdce02fe5a62bbb174e Mon Sep 17 00:00:00 2001 From: nishika26 Date: Fri, 26 Dec 2025 14:27:15 +0530 Subject: [PATCH 2/4] pr review and adding test case --- backend/app/api/docs/collections/info.md | 2 +- backend/app/api/routes/collections.py | 3 +- .../collections/test_collection_info.py | 36 +++++++++++++++++++ 3 files changed, 39 insertions(+), 2 deletions(-) diff --git a/backend/app/api/docs/collections/info.md b/backend/app/api/docs/collections/info.md index 69582812..65c48c7a 100644 --- a/backend/app/api/docs/collections/info.md +++ b/backend/app/api/docs/collections/info.md @@ -3,4 +3,4 @@ timestamps, and associated LLM service details (`llm_service_id` and `llm_servic If the `include_docs` flag in the request body is true then you will get a list of document IDs associated with a given collection as well. Note that, documents returned are not only stored by Kaapi, but also by Vector store provider. -Additionally, if you set the ``include_url`` parameter to true, a signed URL will be included in the response, which is a clickable link to access the retrieved document. If you don't set it to true, the URL will not be included in the response. +Additionally, if you set the `include_url` parameter to true, a signed URL will be included in the response, which is a clickable link to access the retrieved document. If you don't set it to true, the URL will not be included in the response. diff --git a/backend/app/api/routes/collections.py b/backend/app/api/routes/collections.py index b5e37e2f..93f92377 100644 --- a/backend/app/api/routes/collections.py +++ b/backend/app/api/routes/collections.py @@ -188,7 +188,8 @@ def collection_info( include_url: bool = Query( False, description="Include a signed URL to access the document" ), - limit: int = Query( + limit: int + | None = Query( None, gt=0, le=500, diff --git a/backend/app/tests/api/routes/collections/test_collection_info.py b/backend/app/tests/api/routes/collections/test_collection_info.py index 90f8b80c..cc0f7dad 100644 --- a/backend/app/tests/api/routes/collections/test_collection_info.py +++ b/backend/app/tests/api/routes/collections/test_collection_info.py @@ -185,3 +185,39 @@ def test_collection_info_not_found_returns_404( ) assert response.status_code == 404 + + +def test_collection_info_include_docs_and_url( + client: TestClient, + db: Session, + user_api_key_header, +): + """ + Test that when include_docs=true and include_url=true, + the endpoint returns documents with their URLs. + """ + project = get_project(db, "Dalgo") + collection = get_collection(db, project) + + document = link_document_to_collection(db, collection) + + response = client.get( + f"{settings.API_V1_STR}/collections/{collection.id}", + headers=user_api_key_header, + params={"include_docs": "true", "include_url": "true"}, + ) + + assert response.status_code == 200 + + data = response.json() + payload = data["data"] + + assert payload["id"] == str(collection.id) + + docs = payload.get("documents", []) + assert isinstance(docs, list) + assert len(docs) >= 1 + + # Verify document has URL field when include_url=true + doc = docs[0] + assert doc["signed_url"].startswith("https://") From af9d9b870b5eeea3934b891e44c1aa9669e17f52 Mon Sep 17 00:00:00 2001 From: nishika26 Date: Fri, 26 Dec 2025 14:40:30 +0530 Subject: [PATCH 3/4] coderabbit reviews --- backend/app/api/routes/collections.py | 2 +- .../tests/api/routes/collections/test_collection_info.py | 9 ++++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/backend/app/api/routes/collections.py b/backend/app/api/routes/collections.py index 93f92377..60996419 100644 --- a/backend/app/api/routes/collections.py +++ b/backend/app/api/routes/collections.py @@ -203,7 +203,7 @@ def collection_info( if include_docs: document_collection_crud = DocumentCollectionCrud(session) - documents = document_collection_crud.read(collection, limit) + documents = document_collection_crud.read(collection, skip=None, limit=limit) storage = None if include_url and documents: diff --git a/backend/app/tests/api/routes/collections/test_collection_info.py b/backend/app/tests/api/routes/collections/test_collection_info.py index cc0f7dad..75ae2e26 100644 --- a/backend/app/tests/api/routes/collections/test_collection_info.py +++ b/backend/app/tests/api/routes/collections/test_collection_info.py @@ -191,7 +191,7 @@ def test_collection_info_include_docs_and_url( client: TestClient, db: Session, user_api_key_header, -): +) -> None: """ Test that when include_docs=true and include_url=true, the endpoint returns documents with their URLs. @@ -218,6 +218,9 @@ def test_collection_info_include_docs_and_url( assert isinstance(docs, list) assert len(docs) >= 1 - # Verify document has URL field when include_url=true - doc = docs[0] + doc_ids = {d["id"] for d in docs} + assert str(document.id) in doc_ids + + doc = next(d for d in docs if d["id"] == str(document.id)) + assert "signed_url" in doc assert doc["signed_url"].startswith("https://") From ade43445a24121d71ee1fbfce53cab3e8442c667 Mon Sep 17 00:00:00 2001 From: nishika26 Date: Fri, 26 Dec 2025 14:48:32 +0530 Subject: [PATCH 4/4] include url default to true --- backend/app/api/routes/collections.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/app/api/routes/collections.py b/backend/app/api/routes/collections.py index 60996419..b9dc7b3f 100644 --- a/backend/app/api/routes/collections.py +++ b/backend/app/api/routes/collections.py @@ -186,7 +186,7 @@ def collection_info( description="If true, include documents linked to this collection", ), include_url: bool = Query( - False, description="Include a signed URL to access the document" + True, description="Include a signed URL to access the document" ), limit: int | None = Query(