Core modules: - Laws: CRUD, search, AI-powered QA - Analysis: legal research and case management - Contracts: lifecycle management with templates - Signatures: electronic signature workflow Infrastructure: - FastAPI + SQLite + async SQLAlchemy - Docker deployment support - 54 unit tests passing Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
108 lines
3.3 KiB
Python
108 lines
3.3 KiB
Python
"""Vector service for embedding and similarity search."""
|
|
import numpy as np
|
|
from typing import List, Dict, Any, Optional
|
|
import httpx
|
|
|
|
from app.core.config import settings
|
|
|
|
|
|
class VectorService:
    """Client for an HTTP embeddings endpoint plus in-memory similarity search.

    The API base URL, API key, model name and vector dimension are read from
    ``settings`` once, when the instance is constructed.  When no API key is
    configured, the embedding methods return all-zero vectors of length
    ``self.dimension`` instead of performing a request, which allows the
    service to be exercised without network access.
    """

    def __init__(self):
        """Capture the embedding configuration from ``settings``."""
        self.api_base = settings.EMBEDDING_API_BASE
        # Fall back to the LLM key when no dedicated embedding key is set.
        self.api_key = settings.EMBEDDING_API_KEY or settings.LLM_API_KEY
        self.model = settings.EMBEDDING_MODEL
        self.dimension = settings.EMBEDDING_DIMENSION

    async def _request_embeddings(
        self,
        payload_input: Any,
        timeout: float,
    ) -> List[Dict[str, Any]]:
        """POST ``payload_input`` to ``{api_base}/embeddings``.

        Args:
            payload_input: Value sent as the ``"input"`` field of the JSON
                body (a single string or a list of strings).
            timeout: Request timeout in seconds, passed to ``httpx``.

        Returns:
            The list stored under the ``"data"`` key of the JSON response.

        Raises:
            httpx.HTTPStatusError: If the endpoint answers with an error
                status (raised by ``response.raise_for_status()``).
            KeyError: If the response body has no ``"data"`` key.
        """
        async with httpx.AsyncClient() as client:
            response = await client.post(
                f"{self.api_base}/embeddings",
                headers={
                    "Authorization": f"Bearer {self.api_key}",
                    "Content-Type": "application/json",
                },
                json={
                    "model": self.model,
                    "input": payload_input,
                },
                timeout=timeout,
            )
            response.raise_for_status()
            return response.json()["data"]

    async def get_embedding(self, text: str) -> List[float]:
        """Return the embedding vector for a single text.

        Args:
            text: The text to embed.

        Returns:
            The ``"embedding"`` value of the first item in the response, or a
            zero vector of length ``self.dimension`` when no API key is
            configured.

        Raises:
            httpx.HTTPStatusError: If the endpoint answers with an error status.
        """
        if not self.api_key:
            # Mock embedding so the service works without credentials.
            return [0.0] * self.dimension

        items = await self._request_embeddings(text, timeout=30.0)
        return items[0]["embedding"]

    async def get_embeddings(self, texts: List[str]) -> List[List[float]]:
        """Return one embedding vector per entry of ``texts``.

        Args:
            texts: The texts to embed in a single request.

        Returns:
            A list with the ``"embedding"`` value of every response item, in
            the order the endpoint returned them.  An empty ``texts`` yields
            an empty list.  When no API key is configured, one zero vector of
            length ``self.dimension`` is returned per text.

        Raises:
            httpx.HTTPStatusError: If the endpoint answers with an error status.
        """
        if not texts:
            # Nothing to embed: skip the request rather than sending an
            # empty "input" list to the endpoint.
            return []

        if not self.api_key:
            return [[0.0] * self.dimension for _ in texts]

        items = await self._request_embeddings(texts, timeout=60.0)
        return [item["embedding"] for item in items]

    def cosine_similarity(
        self,
        vec1: List[float],
        vec2: List[float]
    ) -> float:
        """Return the cosine similarity of two vectors.

        Args:
            vec1: First vector.
            vec2: Second vector; must have the same length as ``vec1``.

        Returns:
            ``dot(vec1, vec2) / (|vec1| * |vec2|)`` as a Python ``float``, or
            ``0.0`` when either vector has zero norm (this also covers the
            all-zero mock embeddings, avoiding a division by zero).

        Raises:
            ValueError: If the vectors differ in length (raised by ``np.dot``).
        """
        arr1 = np.asarray(vec1)
        arr2 = np.asarray(vec2)

        dot_product = np.dot(arr1, arr2)
        norm1 = np.linalg.norm(arr1)
        norm2 = np.linalg.norm(arr2)

        if norm1 == 0 or norm2 == 0:
            return 0.0

        return float(dot_product / (norm1 * norm2))

    def search_similar(
        self,
        query_embedding: List[float],
        stored_vectors: List[Dict[str, Any]],
        top_k: int = 5
    ) -> List[Dict[str, Any]]:
        """Rank stored vectors by cosine similarity to a query vector.

        Args:
            query_embedding: The vector to compare against.
            stored_vectors: Dicts that each provide an ``"id"`` and an
                ``"embedding"`` entry.
            top_k: Maximum number of results to return.

        Returns:
            Up to ``top_k`` dicts of the form ``{"id": ..., "similarity": ...}``
            ordered by similarity, highest first.  Items with equal similarity
            keep their relative order from ``stored_vectors``.  A ``top_k`` of
            zero or less yields an empty list.
        """
        if top_k <= 0:
            # A negative top_k would otherwise be treated as a slice offset
            # from the end and silently drop the last results.
            return []

        results = [
            {
                "id": item["id"],
                "similarity": self.cosine_similarity(
                    query_embedding,
                    item["embedding"],
                ),
            }
            for item in stored_vectors
        ]

        # Sort by similarity descending (list.sort is stable).
        results.sort(key=lambda entry: entry["similarity"], reverse=True)

        return results[:top_k]
|
|
|
|
|
|
# Singleton instance
|
|
# Built once at import time; VectorService.__init__ reads the embedding
# configuration from app.core.config.settings at that moment.
vector_service = VectorService()
|