diff --git a/backend/app/api/docs/collections/info.md b/backend/app/api/docs/collections/info.md index 576046bd..65c48c7a 100644 --- a/backend/app/api/docs/collections/info.md +++ b/backend/app/api/docs/collections/info.md @@ -1,4 +1,6 @@ Retrieve detailed information about a specific collection by its collection id. This endpoint returns the collection object including its project, organization, timestamps, and associated LLM service details (`llm_service_id` and `llm_service_name`). -Additionally, if the `include_docs` flag in the request body is true then you will get a list of document IDs associated with a given collection as well. Note that, documents returned are not only stored by Kaapi, but also by Vector store provider. +If the `include_docs` query parameter is true, the response also includes the list of documents associated with the given collection. Note that the returned documents are stored not only by Kaapi but also by the vector store provider. + +Additionally, when the `include_url` query parameter is true (the default), each returned document includes a signed URL, which is a clickable link to access that document. Set `include_url` to false to omit the URL from the response. 
diff --git a/backend/app/api/routes/collections.py b/backend/app/api/routes/collections.py index d19fad31..b9dc7b3f 100644 --- a/backend/app/api/routes/collections.py +++ b/backend/app/api/routes/collections.py @@ -12,8 +12,8 @@ CollectionJobCrud, DocumentCollectionCrud, ) +from app.core.cloud import get_cloud_storage from app.models import ( - DocumentPublic, CollectionJobStatus, CollectionActionType, CollectionJobCreate, @@ -32,6 +32,7 @@ create_collection as create_service, delete_collection as delete_service, ) +from app.services.documents.helpers import build_document_schemas logger = logging.getLogger(__name__) @@ -184,8 +185,16 @@ def collection_info( True, description="If true, include documents linked to this collection", ), - skip: int = Query(0, ge=0), - limit: int = Query(100, gt=0, le=100), + include_url: bool = Query( + True, description="Include a signed URL to access the document" + ), + limit: int + | None = Query( + None, + gt=0, + le=500, + description="Limit number of documents returned (default: all, max: 500)", + ), ): collection_crud = CollectionCrud(session, current_user.project_.id) collection = collection_crud.read_one(collection_id) @@ -194,9 +203,16 @@ def collection_info( if include_docs: document_collection_crud = DocumentCollectionCrud(session) - docs = document_collection_crud.read(collection, skip, limit) - collection_with_docs.documents = [ - DocumentPublic.model_validate(doc) for doc in docs - ] + documents = document_collection_crud.read(collection, skip=None, limit=limit) + + storage = None + if include_url and documents: + storage = get_cloud_storage( + session=session, project_id=current_user.project_.id + ) + + collection_with_docs.documents = build_document_schemas( + documents=documents, storage=storage, include_url=include_url + ) return APIResponse.success_response(collection_with_docs) diff --git a/backend/app/api/routes/documents.py b/backend/app/api/routes/documents.py index 16fdcfc6..58beb31b 100644 --- 
a/backend/app/api/routes/documents.py +++ b/backend/app/api/routes/documents.py @@ -11,12 +11,10 @@ Query, UploadFile, ) -from pydantic import HttpUrl from fastapi import Path as FastPath from app.api.deps import AuthContextDep, SessionDep from app.api.permissions import Permission, require_permission -from app.core.cloud import get_cloud_storage from app.crud import CollectionCrud, DocumentCrud from app.crud.rag import OpenAIAssistantCrud, OpenAIVectorStoreCrud from app.models import ( @@ -28,6 +26,7 @@ TransformationJobInfo, DocTransformationJobPublic, ) +from app.core.cloud import get_cloud_storage from app.services.collections.helpers import pick_service_for_documennt from app.services.documents.helpers import ( schedule_transformation, @@ -261,4 +260,5 @@ def doc_info( include_url=include_url, storage=storage, ) + return APIResponse.success_response(doc_schema) diff --git a/backend/app/tests/api/routes/collections/test_collection_info.py b/backend/app/tests/api/routes/collections/test_collection_info.py index 90f8b80c..75ae2e26 100644 --- a/backend/app/tests/api/routes/collections/test_collection_info.py +++ b/backend/app/tests/api/routes/collections/test_collection_info.py @@ -185,3 +185,42 @@ def test_collection_info_not_found_returns_404( ) assert response.status_code == 404 + + +def test_collection_info_include_docs_and_url( + client: TestClient, + db: Session, + user_api_key_header, +) -> None: + """ + Request a collection with include_docs=true and include_url=true and check that + the linked document is listed with a `signed_url` that starts with https://. 
+ """ + project = get_project(db, "Dalgo") + collection = get_collection(db, project) + + document = link_document_to_collection(db, collection) + + response = client.get( + f"{settings.API_V1_STR}/collections/{collection.id}", + headers=user_api_key_header, + params={"include_docs": "true", "include_url": "true"}, + ) + + assert response.status_code == 200 + + data = response.json() + payload = data["data"] + + assert payload["id"] == str(collection.id) + + docs = payload.get("documents", []) + assert isinstance(docs, list) + assert len(docs) >= 1 + + doc_ids = {d["id"] for d in docs} + assert str(document.id) in doc_ids + + doc = next(d for d in docs if d["id"] == str(document.id)) + assert "signed_url" in doc + assert doc["signed_url"].startswith("https://")