What Are Vector Databases?
Vector databases are specialized databases that store and retrieve information based on meaning, not just exact words. Think of them as a smart library where you can ask "find me books about cooking" and it understands you also want recipes, food books, and kitchen guides.
Why Use Vector Databases?
Regular databases only find exact matches, but AI apps often need content with similar meaning (see the short sketch after this list for how that works). Vector databases solve this by enabling:
• Smart Search: Find content based on meaning, not just keywords
• Similar Things: Find related documents, images, or ideas
• Recommendations: Suggest things you might like
• Question Answering: Find relevant information to answer questions
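Under the hood, each piece of text is converted into an embedding (a long list of numbers), and "similar meaning" becomes "nearby vectors." Here is a minimal sketch of that idea using OpenAIEmbeddings and a hand-rolled cosine similarity; the example sentences are placeholders:

from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()

# Two sentences that share meaning but few keywords
vec_a = embeddings.embed_query("How do I bake bread at home?")
vec_b = embeddings.embed_query("A simple recipe for a homemade loaf")

def cosine_similarity(a, b):
    """Higher values mean the two texts are closer in meaning."""
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = sum(x * x for x in a) ** 0.5
    norm_b = sum(x * x for x in b) ** 0.5
    return dot / (norm_a * norm_b)

print(cosine_similarity(vec_a, vec_b))  # noticeably higher than for unrelated text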
Popular Vector Databases
1. Pinecone
import os

from langchain_pinecone import PineconeVectorStore
from langchain_openai import OpenAIEmbeddings

# Set up Pinecone (langchain_pinecone reads the API key from the environment)
os.environ["PINECONE_API_KEY"] = "your-api-key"

# Create embeddings
embeddings = OpenAIEmbeddings()

# Create vector store (the index must already exist in Pinecone)
vectorstore = PineconeVectorStore.from_texts(
    texts=["Your text here"],
    embedding=embeddings,
    index_name="your-index-name"
)

# Search for similar documents
docs = vectorstore.similarity_search("your query", k=5)
2. Weaviate
import weaviate
from langchain_community.vectorstores import Weaviate
from langchain_openai import OpenAIEmbeddings

# Connect to a local Weaviate instance (weaviate-client v3 Client)
client = weaviate.Client("http://localhost:8080")

# Create vector store
vectorstore = Weaviate(
    client=client,
    index_name="Documents",
    text_key="text",
    embedding=OpenAIEmbeddings()
)

# Add documents
vectorstore.add_texts(["Document 1", "Document 2"])

# Search
results = vectorstore.similarity_search("query", k=3)
3. Chroma
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

# Create embeddings
embeddings = OpenAIEmbeddings()

# Create vector store
vectorstore = Chroma.from_texts(
    texts=["Your documents here"],
    embedding=embeddings,
    persist_directory="./chroma_db"
)

# Save the database (recent Chroma versions persist automatically when a directory is set)
vectorstore.persist()

# Load existing database
vectorstore = Chroma(
    persist_directory="./chroma_db",
    embedding_function=embeddings
)
Building a Smart Search System
1. Document Processing
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore

class DocumentProcessor:
    def __init__(self, vector_store_name: str):
        self.embeddings = OpenAIEmbeddings()
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200
        )
        self.vectorstore = PineconeVectorStore(
            index_name=vector_store_name,
            embedding=self.embeddings
        )

    def process_document(self, file_path: str):
        """Process a document and add it to the vector store."""
        # Load the document with the appropriate loader
        if file_path.endswith('.pdf'):
            loader = PyPDFLoader(file_path)
        else:
            loader = TextLoader(file_path)
        documents = loader.load()

        # Split documents into smaller pieces
        splits = self.text_splitter.split_documents(documents)

        # Add to vector store
        self.vectorstore.add_documents(splits)
        return len(splits)

    def search_documents(self, query: str, k: int = 5):
        """Search for relevant documents."""
        return self.vectorstore.similarity_search(query, k=k)
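A quick usage sketch for the class above; the index name and file path are placeholders, and it assumes the Pinecone index already exists and the OPENAI_API_KEY and PINECONE_API_KEY environment variables are set:

processor = DocumentProcessor(vector_store_name="my-docs-index")

# Index a document and report how many chunks were stored
num_chunks = processor.process_document("reports/annual_report.pdf")
print(f"Indexed {num_chunks} chunks")

# Retrieve the most relevant chunks for a question
for doc in processor.search_documents("What were the revenue highlights?", k=3):
    print(doc.page_content[:100])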
2. Smart Search Strategies
from langchain.retrievers import ContextualCompressionRetriever, EnsembleRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain_openai import ChatOpenAI

class SmartSearcher:
    def __init__(self, vectorstore):
        self.vectorstore = vectorstore

        # Create different search methods
        self.similarity_searcher = vectorstore.as_retriever(
            search_type="similarity",
            search_kwargs={"k": 10}
        )
        self.mmr_searcher = vectorstore.as_retriever(
            search_type="mmr",
            search_kwargs={"k": 10, "fetch_k": 20}
        )

        # Combine different search methods
        self.combined_searcher = EnsembleRetriever(
            retrievers=[self.similarity_searcher, self.mmr_searcher],
            weights=[0.7, 0.3]
        )

        # Use an LLM to compress and filter the retrieved results
        llm = ChatOpenAI(temperature=0)
        compressor = LLMChainExtractor.from_llm(llm)
        self.smart_searcher = ContextualCompressionRetriever(
            base_retriever=self.combined_searcher,
            base_compressor=compressor
        )

    def search(self, query: str, use_smart_search: bool = True):
        """Search for documents using smart strategies."""
        if use_smart_search:
            return self.smart_searcher.get_relevant_documents(query)
        else:
            return self.combined_searcher.get_relevant_documents(query)
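A short usage sketch, assuming vectorstore has already been populated as in the previous section (the query text is just an example):

searcher = SmartSearcher(vectorstore)

# Fast path: combined similarity + MMR retrieval only
docs = searcher.search("pricing changes last year", use_smart_search=False)

# Slower but more focused: results are compressed by the LLM first
focused_docs = searcher.search("pricing changes last year", use_smart_search=True)
print(len(docs), len(focused_docs))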
Making It Work Well
1. Speed Up Your App
import asyncio
from concurrent.futures import ThreadPoolExecutor
from typing import List

from langchain_openai import OpenAIEmbeddings

class FastVectorStore:
    def __init__(self, vectorstore):
        self.vectorstore = vectorstore
        self.executor = ThreadPoolExecutor(max_workers=4)

    async def search_many(self, queries: List[str]):
        """Search for multiple queries at the same time."""
        loop = asyncio.get_running_loop()

        # Run the searches in parallel worker threads
        tasks = [
            loop.run_in_executor(
                self.executor,
                self.vectorstore.similarity_search,
                query,
                5
            )
            for query in queries
        ]
        results = await asyncio.gather(*tasks)
        return results

    def prepare_embeddings(self, texts: List[str]):
        """Prepare embeddings ahead of time for better speed."""
        embeddings = OpenAIEmbeddings()
        return embeddings.embed_documents(texts)
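Because search_many is a coroutine, it has to run inside an event loop. A minimal usage sketch, again assuming vectorstore was created earlier (the queries are placeholders):

fast_store = FastVectorStore(vectorstore)

queries = ["refund policy", "shipping times", "warranty coverage"]
all_results = asyncio.run(fast_store.search_many(queries))

for query, docs in zip(queries, all_results):
    print(query, "->", len(docs), "documents")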
2. Remember Search Results
import redis
import hashlib
import json

class CachedVectorStore:
    def __init__(self, vectorstore, redis_url: str = "redis://localhost:6379"):
        self.vectorstore = vectorstore
        self.redis_client = redis.from_url(redis_url)

    def _get_cache_key(self, query: str) -> str:
        """Create a unique key for each search."""
        return hashlib.md5(query.encode()).hexdigest()

    def search_with_cache(self, query: str, k: int = 5):
        """Search with a Redis cache to avoid repeating the same search."""
        cache_key = f"search:{self._get_cache_key(query)}:{k}"

        # Check if we already have this result (cache hits come back as plain dicts)
        cached_result = self.redis_client.get(cache_key)
        if cached_result:
            return json.loads(cached_result)

        # Do the search
        results = self.vectorstore.similarity_search(query, k=k)

        # Remember the result for 1 hour
        self.redis_client.setex(
            cache_key,
            3600,
            json.dumps([doc.dict() for doc in results])
        )
        return results
3. Watch How It Works
import logging
import time
from typing import Dict, Any

class MonitoredVectorStore:
    def __init__(self, vectorstore):
        self.vectorstore = vectorstore
        self.logger = logging.getLogger(__name__)

    def search_with_monitoring(self, query: str, k: int = 5) -> Dict[str, Any]:
        """Search and keep track of how long it takes."""
        start_time = time.time()
        try:
            results = self.vectorstore.similarity_search(query, k=k)

            # Log how long it took
            duration = time.time() - start_time
            self.logger.info(f"Search completed in {duration:.2f}s")

            return {
                "results": results,
                "duration": duration,
                "query": query,
                "k": k,
                "success": True
            }
        except Exception as e:
            duration = time.time() - start_time
            self.logger.error(f"Search failed after {duration:.2f}s: {str(e)}")
            return {
                "results": [],
                "duration": duration,
                "query": query,
                "k": k,
                "success": False,
                "error": str(e)
            }
Real Examples
1. Document Question Answering
from typing import Dict, Any

from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI

class DocumentQASystem:
    def __init__(self, vectorstore):
        self.vectorstore = vectorstore
        self.llm = ChatOpenAI(temperature=0)
        self.qa_chain = RetrievalQA.from_chain_type(
            llm=self.llm,
            chain_type="stuff",
            retriever=vectorstore.as_retriever(search_kwargs={"k": 3})
        )

    def ask_question(self, question: str) -> str:
        """Ask a question about the documents."""
        return self.qa_chain.run(question)

    def ask_with_sources(self, question: str) -> Dict[str, Any]:
        """Ask a question and get the sources used."""
        docs = self.vectorstore.similarity_search(question, k=3)
        answer = self.llm.predict(
            f"Based on the following context, answer the question: {question}\n\n"
            f"Context: {' '.join([doc.page_content for doc in docs])}"
        )
        return {
            "answer": answer,
            "sources": [doc.metadata for doc in docs]
        }
2. Smart Search API
from typing import List

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel

app = FastAPI()

# Assumes `vectorstore` has been created elsewhere (e.g. one of the setups above)

class SearchRequest(BaseModel):
    query: str
    k: int = 5
    include_metadata: bool = True

class SearchResponse(BaseModel):
    results: List[dict]
    query: str
    total_results: int

@app.post("/search", response_model=SearchResponse)
async def smart_search(request: SearchRequest):
    try:
        # Do the search
        docs = vectorstore.similarity_search(request.query, k=request.k)

        # Format the results
        results = []
        for doc in docs:
            result = {"content": doc.page_content}
            if request.include_metadata:
                result["metadata"] = doc.metadata
            results.append(result)

        return SearchResponse(
            results=results,
            query=request.query,
            total_results=len(results)
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

@app.get("/health")
async def health_check():
    return {"status": "healthy"}
Good Practices
1. Manage Your Data
• Split Smart: Choose good sizes for your document pieces
• Add Labels: Include helpful information about your documents
• Keep Versions: Track changes to your database
• Backup: Save copies of your database regularly
2. Keep It Safe
• Control Access: Only let authorized people use your system
• Encrypt Data: Protect your data with encryption
• Secure API: Use API keys and limit how often people can search (see the sketch after this list)
• Log Everything: Keep records of who accessed what
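As one way to cover the "Secure API" point above, here is a hedged sketch of an API-key check for the FastAPI endpoint from the previous section; the header name and key set are placeholders, not a prescribed convention:

from fastapi import Depends, Header, HTTPException

VALID_API_KEYS = {"example-key"}  # placeholder; load real keys from a secrets store

async def require_api_key(x_api_key: str = Header(...)):
    """Reject requests that do not carry a known X-API-Key header."""
    if x_api_key not in VALID_API_KEYS:
        raise HTTPException(status_code=401, detail="Invalid API key")

# Attach the check to the search route:
# @app.post("/search", response_model=SearchResponse,
#           dependencies=[Depends(require_api_key)])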
3. Make It Fast
• Spread the Load: Use multiple servers to handle more users
• Balance Work: Distribute searches across different computers
• Remember Results: Save common search results to avoid doing them again
• Watch Performance: Keep track of how fast your system is
Summary
Vector databases are essential for building smart AI apps. By combining LangChain with vector databases like Pinecone, Weaviate, or Chroma, you can create powerful search systems that understand meaning.
Key things to remember:
• Choose the right database for your needs and size
• Split documents well for better search results
• Use smart search methods to get better results
• Make it fast for real users
• Keep it safe and backed up
The combination of LangChain and vector databases opens up many possibilities for building intelligent apps that can understand and find information based on meaning. Start with simple implementations and add more features as you need them.