-
Notifications
You must be signed in to change notification settings - Fork 7
Collection: making the module provider agnostic #508
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,76 @@ | ||
| """adding blob column in collection table | ||
| Revision ID: 041 | ||
| Revises: 040 | ||
| Create Date: 2025-12-24 11:03:44.620424 | ||
| """ | ||
| from alembic import op | ||
| import sqlalchemy as sa | ||
| from sqlalchemy.dialects import postgresql | ||
|
|
||
| revision = "041" | ||
| down_revision = "040" | ||
| branch_labels = None | ||
| depends_on = None | ||
|
|
||
| provider_enum = postgresql.ENUM( | ||
| "openai", | ||
| name="providertype", | ||
| create_type=True, | ||
| ) | ||
|
|
||
|
|
||
| def upgrade(): | ||
| provider_enum.create(op.get_bind(), checkfirst=True) | ||
|
|
||
| op.add_column( | ||
| "collection", | ||
| sa.Column( | ||
| "collection_blob", | ||
| postgresql.JSONB(astext_type=sa.Text()), | ||
| nullable=True, | ||
| comment="Provider-specific collection parameters (name, description, chunking params etc.)", | ||
| ), | ||
| ) | ||
|
|
||
| op.add_column( | ||
| "collection", | ||
| sa.Column( | ||
| "provider", | ||
| provider_enum, | ||
| nullable=True, | ||
| comment="LLM provider used for this collection (e.g., 'openai', 'bedrock', 'gemini')", | ||
| ), | ||
| ) | ||
|
|
||
| op.execute("UPDATE collection SET provider = 'openai' WHERE provider IS NULL") | ||
|
|
||
| op.alter_column( | ||
| "collection", | ||
| "provider", | ||
| nullable=False, | ||
| existing_type=provider_enum, | ||
| ) | ||
|
|
||
| op.alter_column( | ||
| "collection", | ||
| "llm_service_name", | ||
| existing_type=sa.VARCHAR(), | ||
| comment="Name of the LLM service", | ||
| existing_comment="Name of the LLM provider's service", | ||
| existing_nullable=False, | ||
| ) | ||
|
|
||
|
|
||
| def downgrade(): | ||
| op.alter_column( | ||
| "collection", | ||
| "llm_service_name", | ||
| existing_type=sa.VARCHAR(), | ||
| comment="Name of the LLM service provider", | ||
| existing_comment="Name of the LLM service", | ||
| existing_nullable=False, | ||
| ) | ||
| op.drop_column("collection", "provider") | ||
| op.drop_column("collection", "collection_blob") | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,15 @@ | ||
| from app.models.collection.request import ( | ||
| Collection, | ||
| CreationRequest, | ||
| DeletionRequest, | ||
| CallbackRequest, | ||
| AssistantOptions, | ||
| CreateCollectionParams, | ||
| ProviderType, | ||
| ) | ||
| from app.models.collection.response import ( | ||
| CollectionIDPublic, | ||
| CollectionPublic, | ||
| CollectionWithDocsPublic, | ||
| CreateCollectionResult, | ||
| ) |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,15 +1,26 @@ | ||
| from datetime import datetime | ||
| from enum import Enum | ||
| from typing import Any, Literal | ||
| from uuid import UUID, uuid4 | ||
|
|
||
| from pydantic import HttpUrl, model_validator | ||
| import sqlalchemy as sa | ||
| from sqlalchemy.dialects.postgresql import JSONB, ENUM | ||
| from sqlmodel import Field, Relationship, SQLModel | ||
|
|
||
| from app.core.util import now | ||
| from app.models.document import DocumentPublic | ||
| from app.models.organization import Organization | ||
| from app.models.project import Project | ||
|
|
||
| from .organization import Organization | ||
| from .project import Project | ||
|
|
||
| class ProviderType(str, Enum): | ||
| """Supported LLM providers for collections.""" | ||
|
|
||
| OPENAI = "openai" | ||
|
|
||
|
|
||
| # BEDROCK = "bedrock" | ||
| # GEMINI = "gemini" | ||
|
|
||
|
|
||
| class Collection(SQLModel, table=True): | ||
|
|
@@ -20,6 +31,20 @@ class Collection(SQLModel, table=True): | |
| primary_key=True, | ||
| sa_column_kwargs={"comment": "Unique identifier for the collection"}, | ||
| ) | ||
|
|
||
| provider: ProviderType = Field( | ||
| sa_column=sa.Column( | ||
| ENUM( | ||
| "openai", | ||
| # "bedrock", | ||
| # "gemini", | ||
| name="providertype", | ||
| create_type=False, | ||
| ), | ||
| nullable=False, | ||
| comment="LLM provider used for this collection (e.g., 'openai', 'bedrock', 'gemini', etc)", | ||
| ), | ||
| ) | ||
| llm_service_id: str = Field( | ||
| nullable=False, | ||
| sa_column_kwargs={ | ||
|
|
@@ -30,8 +55,13 @@ class Collection(SQLModel, table=True): | |
| nullable=False, | ||
| sa_column_kwargs={"comment": "Name of the LLM service"}, | ||
| ) | ||
|
|
||
| # Foreign keys | ||
| collection_blob: dict[str, Any] | None = Field( | ||
| sa_column=sa.Column( | ||
| JSONB, | ||
| nullable=True, | ||
| comment="Provider-specific collection parameters (name, description, chunking params etc.)", | ||
| ) | ||
| ) | ||
| organization_id: int = Field( | ||
| foreign_key="organization.id", | ||
| nullable=False, | ||
|
|
@@ -44,8 +74,6 @@ class Collection(SQLModel, table=True): | |
| ondelete="CASCADE", | ||
| sa_column_kwargs={"comment": "Reference to the project"}, | ||
| ) | ||
|
|
||
| # Timestamps | ||
| inserted_at: datetime = Field( | ||
| default_factory=now, | ||
| sa_column_kwargs={"comment": "Timestamp when the collection was created"}, | ||
|
|
@@ -64,27 +92,55 @@ class Collection(SQLModel, table=True): | |
| project: Project = Relationship(back_populates="collections") | ||
|
|
||
|
|
||
| # Request models | ||
| class DocumentOptions(SQLModel): | ||
| documents: list[UUID] = Field( | ||
| description="List of document IDs", | ||
| class DocumentInput(SQLModel): | ||
| """Document to be added to knowledge base.""" | ||
|
|
||
| name: str | None = Field( | ||
| description="Display name for the document", | ||
| ) | ||
| batch_size: int = Field( | ||
| default=1, | ||
| description=( | ||
| "Number of documents to send to OpenAI in a single " | ||
| "transaction. See the `file_ids` parameter in the " | ||
| "vector store [create batch](https://platform.openai.com/docs/api-reference/vector-stores-file-batches/createBatch)." | ||
| ), | ||
| id: UUID = Field( | ||
| description="Reference to uploaded file/document in Kaapi", | ||
| ) | ||
|
Comment on lines
+95
to
+103
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
The `name` field is annotated `str | None` but declares no default, making it required despite being optional in intent; add `default=None`. 🔎 Proposed fix class DocumentInput(SQLModel):
"""Document to be added to knowledge base."""
name: str | None = Field(
+ default=None,
description="Display name for the document",
)
id: UUID = Field(
description="Reference to uploaded file/document in Kaapi",
)🤖 Prompt for AI Agents |
||
|
|
||
|
|
||
| class CreateCollectionParams(SQLModel): | ||
| """Request-specific parameters for knowledge base creation.""" | ||
|
|
||
| name: str | None = Field( | ||
| min_length=1, | ||
| description="Name of the knowledge base to create or update", | ||
| ) | ||
|
Comment on lines
+109
to
+112
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🧩 Analysis chain🏁 Script executed: # Find and read the file to see the actual context
fd backend/app/models/collection/request.py --type fRepository: ProjectTech4DevAI/kaapi-backend Length of output: 57 🏁 Script executed: # Read the specific file around lines 109-112
if [ -f "backend/app/models/collection/request.py" ]; then
wc -l backend/app/models/collection/request.py
fiRepository: ProjectTech4DevAI/kaapi-backend Length of output: 119 🏁 Script executed: # Get the file structure to understand imports and class definition
head -50 backend/app/models/collection/request.pyRepository: ProjectTech4DevAI/kaapi-backend Length of output: 1386 🏁 Script executed: # Get lines 100-120 to see the name field and context
sed -n '100,120p' backend/app/models/collection/request.pyRepository: ProjectTech4DevAI/kaapi-backend Length of output: 729 🏁 Script executed: python3 << 'EOF'
from pydantic import Field
from sqlmodel import SQLModel
class CreateCollectionParams(SQLModel):
name: str | None = Field(
min_length=1,
description="Name of the knowledge base to create or update",
)
# Test 1: None value
try:
t1 = CreateCollectionParams(name=None)
print(f'Test 1 - None accepted: {t1.name}')
except Exception as e:
print(f'Test 1 - Validation error with None: {type(e).__name__}: {e}')
# Test 2: Empty string
try:
t2 = CreateCollectionParams(name="")
print(f'Test 2 - Empty string accepted: {t2.name}')
except Exception as e:
print(f'Test 2 - Validation error with empty string: {type(e).__name__}: {e}')
# Test 3: Valid string
try:
t3 = CreateCollectionParams(name="test")
print(f'Test 3 - Valid string accepted: {t3.name}')
except Exception as e:
print(f'Test 3 - Validation error with valid string: {type(e).__name__}: {e}')
# Test 4: No name provided
try:
t4 = CreateCollectionParams()
print(f'Test 4 - No name provided, defaults to: {t4.name}')
except Exception as e:
print(f'Test 4 - Validation error when name not provided: {type(e).__name__}: {e}')
EOFRepository: ProjectTech4DevAI/kaapi-backend Length of output: 161 🏁 Script executed: # Check the full CreateCollectionParams class and surrounding context
sed -n '105,145p' backend/app/models/collection/request.pyRepository: ProjectTech4DevAI/kaapi-backend Length of output: 1494 🏁 Script executed: # Check if there are imports or other usage patterns in the file
grep -n "sa_column_kwargs" backend/app/models/collection/request.py | head -20Repository: ProjectTech4DevAI/kaapi-backend Length of output: 659 Add explicit The 🤖 Prompt for AI Agents |
||
| description: str | None = Field( | ||
| default=None, | ||
| description="Description of the knowledge base (required by Bedrock, optional for others)", | ||
| ) | ||
| documents: list[DocumentInput] = Field( | ||
| default_factory=list, | ||
| description="List of documents to add to the knowledge base", | ||
| ) | ||
| chunking_params: dict[str, Any] | None = Field( | ||
| default=None, | ||
| description="Chunking parameters for document processing (e.g., chunk_size, chunk_overlap)", | ||
| ) | ||
| additional_params: dict[str, Any] | None = Field( | ||
| default=None, | ||
| description="Additional provider-specific parameters", | ||
| ) | ||
|
|
||
| def model_post_init(self, __context: Any): | ||
| self.documents = list(set(self.documents)) | ||
| """Deduplicate documents by document id.""" | ||
| seen = set() | ||
| unique_docs = [] | ||
| for doc in self.documents: | ||
| if doc.id not in seen: | ||
| seen.add(doc.id) | ||
| unique_docs.append(doc) | ||
| self.documents = unique_docs | ||
|
|
||
|
|
||
| class AssistantOptions(SQLModel): | ||
| # Fields to be passed along to OpenAI. They must be a subset of | ||
| # parameters accepted by the OpenAI.clien.beta.assistants.create | ||
| # parameters accepted by the OpenAI.client.beta.assistants.create | ||
| # API. | ||
| model: str | None = Field( | ||
| default=None, | ||
|
|
@@ -139,6 +195,8 @@ def norm(x: Any) -> Any: | |
|
|
||
|
|
||
| class CallbackRequest(SQLModel): | ||
| """Optional callback configuration for async job notifications.""" | ||
|
|
||
| callback_url: HttpUrl | None = Field( | ||
| default=None, | ||
| description="URL to call to report endpoint status", | ||
|
|
@@ -148,45 +206,38 @@ class CallbackRequest(SQLModel): | |
| class ProviderOptions(SQLModel): | ||
| """LLM provider configuration.""" | ||
|
|
||
| provider: Literal["openai"] = Field( | ||
| default="openai", description="LLM provider to use for this collection" | ||
| provider: ProviderType = Field( | ||
| default=ProviderType.OPENAI, | ||
| description="LLM provider to use for this collection", | ||
| ) | ||
|
|
||
|
|
||
| class CreationRequest( | ||
| DocumentOptions, | ||
| ProviderOptions, | ||
| AssistantOptions, | ||
| CallbackRequest, | ||
| ): | ||
| def extract_super_type(self, cls: "CreationRequest"): | ||
| for field_name in cls.model_fields.keys(): | ||
| field_value = getattr(self, field_name) | ||
| yield (field_name, field_value) | ||
|
|
||
|
|
||
| class DeletionRequest(CallbackRequest): | ||
| collection_id: UUID = Field(description="Collection to delete") | ||
|
|
||
|
|
||
| # Response models | ||
| @model_validator(mode="before") | ||
| def normalize_provider(cls, values: dict[str, Any]) -> dict[str, Any]: | ||
| """Normalize provider value to lowercase for case-insensitive matching.""" | ||
| if isinstance(values, dict) and "provider" in values: | ||
| provider = values["provider"] | ||
| if isinstance(provider, str): | ||
| values["provider"] = provider.lower() | ||
| return values | ||
|
|
||
|
|
||
| class CollectionIDPublic(SQLModel): | ||
| id: UUID | ||
| class CreationRequest(AssistantOptions, ProviderOptions, CallbackRequest): | ||
| """API request for collection creation""" | ||
|
|
||
| collection_params: CreateCollectionParams = Field( | ||
| ..., | ||
| description="Collection creation specific parameters (name, documents, etc.)", | ||
| ) | ||
| batch_size: int = Field( | ||
| default=10, | ||
| ge=1, | ||
| le=500, | ||
| description="Number of documents to process in a single batch", | ||
| ) | ||
|
|
||
| class CollectionPublic(SQLModel): | ||
| id: UUID | ||
| llm_service_id: str | ||
| llm_service_name: str | ||
| project_id: int | ||
| organization_id: int | ||
|
|
||
| inserted_at: datetime | ||
| updated_at: datetime | ||
| deleted_at: datetime | None = None | ||
| class DeletionRequest(ProviderOptions, CallbackRequest): | ||
|
|
||
| """API request for collection deletion""" | ||
|
|
||
| class CollectionWithDocsPublic(CollectionPublic): | ||
| documents: list[DocumentPublic] | None = None | ||
| collection_id: UUID = Field(description="Collection to delete") | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,33 @@ | ||
| from datetime import datetime | ||
| from typing import Any | ||
| from uuid import UUID | ||
|
|
||
| from sqlmodel import SQLModel | ||
|
|
||
| from app.models.document import DocumentPublic | ||
|
|
||
|
|
||
| class CreateCollectionResult(SQLModel): | ||
| llm_service_id: str | ||
| llm_service_name: str | ||
| collection_blob: dict[str, Any] | ||
|
|
||
|
|
||
| class CollectionIDPublic(SQLModel): | ||
| id: UUID | ||
|
|
||
|
|
||
| class CollectionPublic(SQLModel): | ||
| id: UUID | ||
| llm_service_id: str | ||
| llm_service_name: str | ||
| project_id: int | ||
| organization_id: int | ||
|
|
||
| inserted_at: datetime | ||
| updated_at: datetime | ||
| deleted_at: datetime | None = None | ||
|
|
||
|
|
||
| class CollectionWithDocsPublic(CollectionPublic): | ||
| documents: list[DocumentPublic] | None = None |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Missing ENUM type drop in downgrade.
The
`downgrade()` function drops the `provider` and `collection_blob` columns but doesn't drop the `providertype` ENUM type. This could leave orphaned types in the database after a rollback. 🔎 Proposed fix
def downgrade(): op.alter_column( "collection", "llm_service_name", existing_type=sa.VARCHAR(), comment="Name of the LLM service provider", existing_comment="Name of the LLM service", existing_nullable=False, ) op.drop_column("collection", "provider") op.drop_column("collection", "collection_blob") + provider_enum.drop(op.get_bind(), checkfirst=True)🤖 Prompt for AI Agents