Article4
Here’s a minimal, brute-force vector similarity search implementation (exact cosine similarity computed against every stored vector):
from typing import List, Optional, Tuple

import numpy as np
def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Compute the cosine similarity between two 1-D vectors.

    Args:
        a: First vector.
        b: Second vector, same length as ``a``.

    Returns:
        ``dot(a, b) / (||a|| * ||b||)``. If either vector has zero norm the
        similarity is undefined and ``0.0`` is returned.
    """
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    # Guard the zero-vector case explicitly instead of adding an epsilon to
    # the denominator: a fixed epsilon (e.g. 1e-8) dominates the product of
    # the norms for small-magnitude vectors and badly skews the result.
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return float(np.dot(a, b) / (norm_a * norm_b))
class _DimensionMismatchError(ValueError, AssertionError):
    """Raised when a vector's shape does not match the index dimension.

    Also derives from ``AssertionError`` because this check used to be a bare
    ``assert``; callers that caught that exception keep working.
    """


class VectorSearchIndex:
    """In-memory index that ranks stored vectors by cosine similarity.

    Every query is compared against every stored vector (linear scan).

    Attributes:
        dimension: Required length of every stored and query vector.
        vectors: Stored vectors, in insertion order.
        metadata: Metadata dicts, parallel to ``vectors``.
    """

    def __init__(self, dimension: int = 768):
        self.dimension = dimension
        self.vectors: List[np.ndarray] = []
        self.metadata: List[dict] = []

    def _as_vector(self, value, role: str) -> np.ndarray:
        """Return ``value`` as an array, checking it is 1-D of the right length."""
        arr = np.asarray(value)
        # Compare the full shape, not just shape[0], so that a (dimension, k)
        # matrix or a 0-d scalar is rejected as well.
        if arr.shape != (self.dimension,):
            raise _DimensionMismatchError(
                f"{role} must have shape ({self.dimension},), got {arr.shape}"
            )
        return arr

    def add(self, vector: np.ndarray, meta: Optional[dict] = None):
        """Add a vector to the index with optional metadata.

        Args:
            vector: 1-D array (or array-like) of length ``self.dimension``.
            meta: Metadata returned alongside this vector by ``search``.
                A fresh empty dict is stored when omitted.

        Raises:
            ValueError: If ``vector`` does not have shape ``(dimension,)``.
                The raised exception is also an ``AssertionError``.
        """
        # Validate before touching either list so a rejected vector cannot
        # leave ``vectors`` and ``metadata`` out of step.
        arr = self._as_vector(vector, "vector")
        self.vectors.append(arr)
        # ``is None`` rather than ``or`` so a caller-supplied empty dict is
        # stored as-is instead of being swapped for a new one.
        self.metadata.append({} if meta is None else meta)

    def search(
        self,
        query: np.ndarray,
        top_k: int = 5
    ) -> List[Tuple[float, dict]]:
        """Find the ``top_k`` stored vectors most similar to ``query``.

        Args:
            query: 1-D array (or array-like) of length ``self.dimension``.
            top_k: Maximum number of results; values <= 0 yield no results.

        Returns:
            ``(similarity, metadata)`` pairs ordered from most to least
            similar. Ties keep insertion order.

        Raises:
            ValueError: If ``query`` does not have shape ``(dimension,)``.
        """
        query_vec = self._as_vector(query, "query")
        # A negative top_k would otherwise act as a negative slice bound and
        # return "all but the last |top_k|" results.
        if top_k <= 0:
            return []
        scored = [
            (cosine_similarity(query_vec, vec), meta)
            for vec, meta in zip(self.vectors, self.metadata)
        ]
        # Sort on the score only; metadata dicts are not orderable.
        scored.sort(key=lambda pair: pair[0], reverse=True)
        return scored[:top_k]
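This implementation compares the query against every stored vector, so search time grows linearly with the size of the index. For production use, consider using FAISS for GPU-accelerated search, or ScaNN for billion-scale datasets.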
