diff --git a/examples/lightspeed-stack-ollama.yaml b/examples/lightspeed-stack-ollama.yaml
new file mode 100644
index 000000000..307d4289a
--- /dev/null
+++ b/examples/lightspeed-stack-ollama.yaml
@@ -0,0 +1,59 @@
+# Lightspeed Stack Configuration for Ollama
+#
+# This configuration file sets up Lightspeed Stack to use Ollama for local LLM inference.
+# Works in conjunction with examples/ollama-run.yaml for Llama Stack configuration.
+#
+# Quick Start:
+#   1. Install dependencies: uv sync --group llslibdev
+#   2. Install Ollama: https://ollama.com
+#   3. Pull a model: ollama pull llama3.2:latest
+#   4. Copy configs: cp examples/ollama-run.yaml run.yaml
+#                    cp examples/lightspeed-stack-ollama.yaml lightspeed-stack.yaml
+#   5. Start server: make run
+#
+# Deployment Modes:
+#   - Library mode (default): Llama Stack runs embedded in Lightspeed process
+#   - Remote mode: Llama Stack runs as separate service (requires manual start)
+#
+
+name: Lightspeed Core Service (LCS) with Ollama
+service:
+  host: 0.0.0.0
+  port: 8080
+  auth_enabled: false
+  workers: 1
+  color_log: true
+  access_log: true
+
+llama_stack:
+  # Use Llama Stack as embedded library (single process mode)
+  # This starts both Lightspeed Stack and Llama Stack in one process
+  use_as_library_client: true
+  library_client_config_path: ollama-run.yaml
+
+  # Alternative: Use Llama Stack as separate service (uncomment below and comment above)
+  # This requires running "uv run llama stack run examples/ollama-run.yaml" separately
+  # use_as_library_client: false
+  # url: http://localhost:8321
+  # api_key: xyzzy
+
+user_data_collection:
+  feedback_enabled: true
+  feedback_storage: "/tmp/data/feedback"
+  transcripts_enabled: true
+  transcripts_storage: "/tmp/data/transcripts"
+
+authentication:
+  module: "noop"
+
+inference:
+  # Default to the fastest local model
+  # Note: Ensure this model is pulled via: ollama pull llama3.2:latest
+  default_model: "llama3.2:latest"
+  default_provider: "ollama"
+
+# Optional: Configure conversation cache for better performance
+# conversation_cache:
+#   type: "sqlite"
+#   sqlite:
+#     db_path: "/tmp/lightspeed-ollama-cache.db"
diff --git a/examples/ollama-run.yaml b/examples/ollama-run.yaml
new file mode 100644
index 000000000..898fa865c
--- /dev/null
+++ b/examples/ollama-run.yaml
@@ -0,0 +1,246 @@
+# Llama Stack Configuration for Ollama Integration
+#
+# This configuration enables Lightspeed Stack to use Ollama for local LLM inference.
+# Ollama allows running models locally without requiring cloud API keys or internet connectivity.
+#
+# Prerequisites:
+#   1. Install Ollama: https://ollama.com
+#   2. Pull at least one model: ollama pull llama3.2:latest
+#   3. Ensure Ollama is running: ollama serve (or run Ollama app)
+#
+# Usage:
+#   cp examples/ollama-run.yaml run.yaml
+#   cp examples/lightspeed-stack-ollama.yaml lightspeed-stack.yaml
+#   make run
+#
+# ⚠️ KNOWN LIMITATION - AGENTS PROVIDER REQUIRES SAFETY API ⚠️
+#
+# Current Status: SERVER STARTS ✓ but QUERIES FAIL ✗
+#
+# The meta-reference agents provider in Llama Stack has a hard dependency on the
+# safety API. However, the safety API (llama-guard) appears to require an OpenAI
+# provider, creating a circular dependency that prevents pure Ollama-only operation.
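+# Spelled out, the dependency chain hit by this configuration is roughly:
+#   /v1/query -> agents (meta-reference) -> safety API -> llama-guard -> OpenAI-backed inference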
+#
+# Configuration State:
+#   - agents API: ENABLED (required by Lightspeed /v1/query endpoint)
+#   - safety API: DISABLED (has OpenAI dependency)
+#   - Result: Server starts but agents provider cannot initialize without safety
+#
+# What Actually Works:
+#   ✓ Server startup and readiness checks pass
+#   ✓ Ollama provider loads and connects to localhost:11434
+#   ✓ Embedding models via sentence-transformers
+#   ✓ Vector storage with FAISS
+#   ✓ Health monitoring endpoints
+#
+# What's Blocked:
+#   ✗ /v1/query endpoint (returns 500 - agents needs safety)
+#   ✗ /v1/query_v2 endpoint (same issue)
+#   ✗ Streaming query endpoints (same issue)
+#   ✗ Shield-based content moderation
+#
+# Workarounds:
+#   1. Add minimal OpenAI config just for safety (hybrid approach)
+#   2. Use direct /v1/inference/chat-completion endpoint (if available)
+#   3. Wait for Llama Stack fix to make safety optional in agents provider
+#
+# An issue will be filed with the Llama Stack project to address this dependency.
+#
+
+version: '2'
+image_name: ollama-llama-stack-configuration
+
+apis:
+  - agents          # Required by Lightspeed /v1/query endpoint (but has safety dependency - see below)
+  - datasetio
+  - eval
+  - files
+  - inference       # Required - Ollama provider configured here
+  - post_training
+  # - safety        # DISABLED: llama-guard has OpenAI dependency, blocking agents from working
+  - scoring
+  - telemetry
+  - tool_runtime
+  - vector_io
+
+benchmarks: []
+container_image: null
+datasets: []
+external_providers_dir: null
+
+inference_store:
+  db_path: .llama/distributions/ollama/inference_store.db
+  type: sqlite
+
+logging: null
+
+metadata_store:
+  db_path: .llama/distributions/ollama/registry.db
+  namespace: null
+  type: sqlite
+
+providers:
+  files:
+    - provider_id: localfs
+      provider_type: inline::localfs
+      config:
+        storage_dir: /tmp/llama-stack-files
+        metadata_store:
+          type: sqlite
+          db_path: .llama/distributions/ollama/files_metadata.db
+
+  agents:
+    - provider_id: meta-reference
+      provider_type: inline::meta-reference
+      config:
+        persistence_store:
+          db_path: .llama/distributions/ollama/agents_store.db
+          namespace: null
+          type: sqlite
+        responses_store:
+          db_path: .llama/distributions/ollama/responses_store.db
+          type: sqlite
+
+  datasetio:
+    - provider_id: huggingface
+      provider_type: remote::huggingface
+      config:
+        kvstore:
+          db_path: .llama/distributions/ollama/huggingface_datasetio.db
+          namespace: null
+          type: sqlite
+    - provider_id: localfs
+      provider_type: inline::localfs
+      config:
+        kvstore:
+          db_path: .llama/distributions/ollama/localfs_datasetio.db
+          namespace: null
+          type: sqlite
+
+  eval:
+    - provider_id: meta-reference
+      provider_type: inline::meta-reference
+      config:
+        kvstore:
+          db_path: .llama/distributions/ollama/meta_reference_eval.db
+          namespace: null
+          type: sqlite
+
+  inference:
+    # Embedding model for RAG - use sentence-transformers
+    - provider_id: sentence-transformers
+      provider_type: inline::sentence-transformers
+      config: {}
+    # Local LLM inference via Ollama
+    - provider_id: ollama
+      provider_type: remote::ollama
+      config:
+        url: http://localhost:11434  # Default Ollama port
+
+  post_training:
+    - provider_id: huggingface
+      provider_type: inline::huggingface-gpu
+      config:
+        checkpoint_format: huggingface
+        device: cpu
+        distributed_backend: null
+        dpo_output_dir: "."
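+
+  # safety is intentionally not configured here (see the note at the top of this file).
+  # A hybrid workaround (untested) would re-enable the commented block below and add an
+  # OpenAI-backed inference provider for the guard model to call.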
+
+  # safety:
+  #   - provider_id: llama-guard
+  #     provider_type: inline::llama-guard
+  #     config:
+  #       excluded_categories: []
+
+  scoring:
+    - provider_id: basic
+      provider_type: inline::basic
+      config: {}
+    # Disabled: These providers require OpenAI
+    # - provider_id: llm-as-judge
+    #   provider_type: inline::llm-as-judge
+    #   config: {}
+    # - provider_id: braintrust
+    #   provider_type: inline::braintrust
+    #   config:
+    #     openai_api_key: '********'
+
+  telemetry:
+    - provider_id: meta-reference
+      provider_type: inline::meta-reference
+      config:
+        service_name: 'lightspeed-stack-ollama'
+        sinks: sqlite
+        sqlite_db_path: .llama/distributions/ollama/trace_store.db
+
+  tool_runtime:
+    - provider_id: model-context-protocol
+      provider_type: remote::model-context-protocol
+      config: {}
+    - provider_id: rag-runtime
+      provider_type: inline::rag-runtime
+      config: {}
+
+  vector_io:
+    - provider_id: faiss
+      provider_type: inline::faiss
+      config:
+        kvstore:
+          db_path: .llama/distributions/ollama/faiss_store.db
+          namespace: null
+          type: sqlite
+
+scoring_fns: []
+
+server:
+  auth: null
+  host: null
+  port: 8321
+  quota: null
+  tls_cafile: null
+  tls_certfile: null
+  tls_keyfile: null
+
+shields: []
+  # Disabled - llama-guard requires specific Llama Guard models
+  # - shield_id: llama-guard-shield
+  #   provider_id: llama-guard
+  #   provider_shield_id: "llama3.2:latest"
+
+vector_dbs:
+  - vector_db_id: my_knowledge_base
+    embedding_model: sentence-transformers/all-mpnet-base-v2
+    embedding_dimension: 768
+    provider_id: faiss
+
+models:
+  # Embedding model for RAG
+  - model_id: sentence-transformers/all-mpnet-base-v2
+    model_type: embedding
+    provider_id: sentence-transformers
+    provider_model_id: sentence-transformers/all-mpnet-base-v2
+    metadata:
+      embedding_dimension: 768
+
+  # Local Ollama models (users must pull these first with: ollama pull <model-name>)
+  # Fast, small model - great for development
+  - model_id: llama3.2:latest
+    model_type: llm
+    provider_id: ollama
+    provider_model_id: llama3.2:latest
+
+  # To add more models, first pull them with: ollama pull <model-name>
+  # Then uncomment and configure:
+  # - model_id: qwen2.5:7b
+  #   model_type: llm
+  #   provider_id: ollama
+  #   provider_model_id: qwen2.5:7b
+  #
+  # - model_id: llama3.1:8b
+  #   model_type: llm
+  #   provider_id: ollama
+  #   provider_model_id: llama3.1:8b
+
+tool_groups:
+  - toolgroup_id: builtin::rag
+    provider_id: rag-runtime
diff --git a/pyproject.toml b/pyproject.toml
index 5c8ff3944..9c5357d4e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -158,6 +158,9 @@ llslibdev = [
     "opentelemetry-instrumentation>=0.55b0",
     "blobfile>=3.0.0",
     "psutil>=7.0.0",
+    # API inference: remote::ollama
+    "ollama>=0.4.7",
+    "h11>=0.16.0",
 ]

 build = [
diff --git a/src/app/endpoints/query.py b/src/app/endpoints/query.py
index 4993da1a6..7b0c91225 100644
--- a/src/app/endpoints/query.py
+++ b/src/app/endpoints/query.py
@@ -687,22 +687,29 @@ async def retrieve_response(  # pylint: disable=too-many-locals,too-many-branche
         a summary of the LLM or agent's response content, the conversation ID,
         the list of parsed referenced documents, and token usage information.
     """
-    available_input_shields = [
-        shield.identifier
-        for shield in filter(is_input_shield, await client.shields.list())
-    ]
-    available_output_shields = [
-        shield.identifier
-        for shield in filter(is_output_shield, await client.shields.list())
-    ]
-    if not available_input_shields and not available_output_shields:
-        logger.info("No available shields. Disabling safety")
-    else:
-        logger.info(
-            "Available input shields: %s, output shields: %s",
-            available_input_shields,
-            available_output_shields,
-        )
+    # Try to get available shields, but gracefully handle if safety API is not available
+    try:
+        available_input_shields = [
+            shield.identifier
+            for shield in filter(is_input_shield, await client.shields.list())
+        ]
+        available_output_shields = [
+            shield.identifier
+            for shield in filter(is_output_shield, await client.shields.list())
+        ]
+        if not available_input_shields and not available_output_shields:
+            logger.info("No available shields. Disabling safety")
+        else:
+            logger.info(
+                "Available input shields: %s, output shields: %s",
+                available_input_shields,
+                available_output_shields,
+            )
+    except (ValueError, KeyError) as e:
+        # Safety API not available (e.g., when using minimal Ollama configuration)
+        logger.info("Safety API not available, disabling shields: %s", e)
+        available_input_shields = []
+        available_output_shields = []
     # use system prompt from request or default one
     system_prompt = get_system_prompt(query_request, configuration)
     logger.debug("Using system prompt: %s", system_prompt)
diff --git a/uv.lock b/uv.lock
index 500394bc0..56a78a6a1 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1399,11 +1399,13 @@ llslibdev = [
     { name = "emoji" },
     { name = "faiss-cpu" },
     { name = "fire" },
+    { name = "h11" },
     { name = "langdetect" },
     { name = "matplotlib" },
     { name = "mcp" },
     { name = "nltk" },
     { name = "numpy" },
+    { name = "ollama" },
     { name = "opentelemetry-exporter-otlp" },
     { name = "opentelemetry-instrumentation" },
     { name = "opentelemetry-sdk" },
@@ -1480,11 +1482,13 @@ llslibdev = [
     { name = "emoji", specifier = ">=2.1.0" },
     { name = "faiss-cpu", specifier = ">=1.11.0" },
     { name = "fire", specifier = ">=0.7.0" },
+    { name = "h11", specifier = ">=0.16.0" },
     { name = "langdetect", specifier = ">=1.0.9" },
     { name = "matplotlib", specifier = ">=3.10.0" },
     { name = "mcp", specifier = ">=1.9.4" },
     { name = "nltk", specifier = ">=3.8.1" },
     { name = "numpy", specifier = "==2.2.6" },
+    { name = "ollama", specifier = ">=0.4.7" },
     { name = "opentelemetry-exporter-otlp", specifier = ">=1.34.1" },
     { name = "opentelemetry-instrumentation", specifier = ">=0.55b0" },
     { name = "opentelemetry-sdk", specifier = ">=1.34.1" },
@@ -2002,6 +2006,19 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/be/9c/92789c596b8df838baa98fa71844d84283302f7604ed565dafe5a6b5041a/oauthlib-3.3.1-py3-none-any.whl", hash = "sha256:88119c938d2b8fb88561af5f6ee0eec8cc8d552b7bb1f712743136eb7523b7a1", size = 160065, upload-time = "2025-06-19T22:48:06.508Z" },
 ]

+[[package]]
+name = "ollama"
+version = "0.6.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "httpx" },
+    { name = "pydantic" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/9d/5a/652dac4b7affc2b37b95386f8ae78f22808af09d720689e3d7a86b6ed98e/ollama-0.6.1.tar.gz", hash = "sha256:478c67546836430034b415ed64fa890fd3d1ff91781a9d548b3325274e69d7c6", size = 51620, upload-time = "2025-11-13T23:02:17.416Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/47/4f/4a617ee93d8208d2bcf26b2d8b9402ceaed03e3853c754940e2290fed063/ollama-0.6.1-py3-none-any.whl", hash = "sha256:fc4c984b345735c5486faeee67d8a265214a31cbb828167782dc642ce0a2bf8c", size = 14354, upload-time = "2025-11-13T23:02:16.292Z" },
+]
+
 [[package]]
 name = "openai"
 version = "2.7.2"
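As a quick, optional sanity check of the new ollama dependency and the local Ollama server before running make run, something along these lines can be used. This is a minimal sketch, not part of the patch: it assumes Ollama is listening on the default http://localhost:11434, that llama3.2:latest has already been pulled, and that response attribute names follow the typed client in ollama>=0.4.

"""Sanity-check the local Ollama setup assumed by examples/ollama-run.yaml."""

import ollama  # Python client added to the llslibdev dependency group above

# List the models Ollama has pulled locally (talks to http://localhost:11434 by default).
local_models = [m.model for m in ollama.list().models]
print("Locally available models:", local_models)

if "llama3.2:latest" not in local_models:
    raise SystemExit("Missing model - run: ollama pull llama3.2:latest")

# One-shot chat completion against the default model from lightspeed-stack-ollama.yaml.
response = ollama.chat(
    model="llama3.2:latest",
    messages=[{"role": "user", "content": "Reply with one short sentence."}],
)
print(response.message.content)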