Coverage for src / idx_api / embeddings / base.py: 65%
68 statements
« prev ^ index » next coverage.py v7.13.1, created at 2025-12-28 11:12 -0700
« prev ^ index » next coverage.py v7.13.1, created at 2025-12-28 11:12 -0700
1"""Core embedding functions for PostgreSQL with pgvector.
3Provides app-level embedding generation via Ollama API and pgvector table creation.
4"""
6import httpx
7import numpy as np
8from pgvector.sqlalchemy import Vector
9from sqlalchemy import Column, DateTime, ForeignKey, Index, Integer, String, Text, text
10from sqlalchemy.orm import Session
12from idx_api.config import settings
13from idx_api.models.base import Base
16# SQLAlchemy Models for image-related tables with pgvector embeddings
class PropertyImage(Base):
    """A property photo: its URL, descriptive metadata and embeddings."""

    __tablename__ = "property_images"

    # Row identity and the listing/URL the image belongs to.
    id = Column(Integer, primary_key=True, autoincrement=True)
    listing_id = Column(String(50), index=True, nullable=False)
    image_url = Column(Text, nullable=False)
    image_index = Column(Integer, nullable=False)

    # Optional descriptive fields; created_at is filled in by the database.
    description = Column(Text)
    room_type = Column(String(50))
    created_at = Column(DateTime, server_default=text("CURRENT_TIMESTAMP"))

    # Both vectors are stored on this row (there are no separate vec tables).
    description_embedding = Column(Vector(1024))  # mxbai-embed-large
    visual_embedding = Column(Vector(1024))  # SigLIP

    # A given URL may appear only once per listing.
    __table_args__ = (
        Index(
            "ix_property_images_listing_url",
            "listing_id",
            "image_url",
            unique=True,
        ),
    )
class ImageFeatureTag(Base):
    """A structured (type, value) feature tag on an image, for inverse footnotes."""

    __tablename__ = "image_feature_tags"

    id = Column(Integer, primary_key=True, autoincrement=True)
    # Owning row in property_images.
    image_id = Column(Integer, ForeignKey("property_images.id"), nullable=False)
    feature_type = Column(String(50), nullable=False)
    feature_value = Column(String(255), nullable=False)

    __table_args__ = (
        # Lookup by value alone, and by (type, value).
        Index("ix_image_feature_tags_value", "feature_value"),
        Index(
            "ix_image_feature_tags_type_value",
            "feature_type",
            "feature_value",
        ),
        # The same (type, value) pair may be attached to an image only once.
        Index(
            "ix_image_feature_tags_unique",
            "image_id",
            "feature_type",
            "feature_value",
            unique=True,
        ),
    )
58async def generate_embedding(text_content: str) -> list[float] | None:
59 """Generate embedding via Ollama API (app-level, not in-SQL).
61 Args:
62 text_content: Text to embed
64 Returns:
65 List of floats representing the embedding, or None on error
66 """
67 if not text_content or not text_content.strip():
68 return None
70 try:
71 async with httpx.AsyncClient(timeout=60.0) as client:
72 response = await client.post(
73 f"{settings.ollama_base_url}/api/embeddings",
74 json={
75 "model": settings.ollama_embed_model,
76 "prompt": text_content,
77 },
78 )
79 response.raise_for_status()
80 data = response.json()
81 return data.get("embedding")
82 except Exception as e:
83 print(f"Error generating embedding: {e}")
84 return None
87def generate_embedding_sync(text_content: str) -> list[float] | None:
88 """Synchronous version of embedding generation for non-async contexts.
90 Args:
91 text_content: Text to embed
93 Returns:
94 List of floats representing the embedding, or None on error
95 """
96 if not text_content or not text_content.strip():
97 return None
99 try:
100 with httpx.Client(timeout=60.0) as client:
101 response = client.post(
102 f"{settings.ollama_base_url}/api/embeddings",
103 json={
104 "model": settings.ollama_embed_model,
105 "prompt": text_content,
106 },
107 )
108 response.raise_for_status()
109 data = response.json()
110 return data.get("embedding")
111 except Exception as e:
112 print(f"Error generating embedding: {e}")
113 return None
def embedding_to_pgvector(embedding: list[float]) -> str:
    """Render an embedding as a pgvector text literal.

    Args:
        embedding: Vector components, in order.

    Returns:
        The ``str()`` of each component joined by commas and wrapped in
        square brackets, e.g. ``'[0.1,0.2,0.3]'``.
    """
    components = ",".join(map(str, embedding))
    return f"[{components}]"
def create_vector_indexes(session: Session) -> None:
    """Enable pgvector and create the IVFFlat similarity-search indexes.

    IVFFlat indexes partition vectors into lists for approximate nearest
    neighbor search. Every index here uses ``vector_cosine_ops`` with
    ``lists = 100``. All statements are idempotent (``IF NOT EXISTS``) and
    are committed together at the end.

    NOTE(review): pgvector's guidance for ``lists`` is roughly rows / 1000
    up to 1M rows, which would make 100 a fit for ~100K rows per table;
    confirm against the expected data volume.

    Args:
        session: SQLAlchemy session used to execute and commit the DDL.
    """
    ddl_statements = (
        # The extension must exist before any ivfflat index can be created.
        "CREATE EXTENSION IF NOT EXISTS vector",
        # Property embeddings
        """
            CREATE INDEX IF NOT EXISTS idx_properties_embedding
            ON properties USING ivfflat (embedding vector_cosine_ops)
            WITH (lists = 100)
        """,
        # Agent embeddings
        """
            CREATE INDEX IF NOT EXISTS idx_agents_embedding
            ON agents USING ivfflat (embedding vector_cosine_ops)
            WITH (lists = 100)
        """,
        # Brokerage embeddings
        """
            CREATE INDEX IF NOT EXISTS idx_brokerages_embedding
            ON brokerages USING ivfflat (embedding vector_cosine_ops)
            WITH (lists = 100)
        """,
        # Image description embeddings
        """
            CREATE INDEX IF NOT EXISTS idx_property_images_description_embedding
            ON property_images USING ivfflat (description_embedding vector_cosine_ops)
            WITH (lists = 100)
        """,
        # Image visual embeddings
        """
            CREATE INDEX IF NOT EXISTS idx_property_images_visual_embedding
            ON property_images USING ivfflat (visual_embedding vector_cosine_ops)
            WITH (lists = 100)
        """,
    )

    # Run in declaration order, then commit once.
    for ddl in ddl_statements:
        session.execute(text(ddl))

    session.commit()
185# Legacy compatibility - these were used by SQLite code
def ensure_embedding_client(session: Session) -> None:
    """Do nothing; retained for legacy callers.

    With PostgreSQL, embeddings are generated at application level, so
    there is no client to set up. ``session`` is accepted and ignored.
    """
    return None
def init_embedding_client(session: Session) -> None:
    """Do nothing; retained for legacy callers.

    With PostgreSQL, embeddings are generated at application level, so
    there is no client to initialise. ``session`` is accepted and ignored.
    """
    return None
def create_vector_tables(session: Session) -> None:
    """Do nothing; with PostgreSQL, tables come from Alembic migrations.

    Vector columns belong to the main tables (properties, agents, etc.)
    instead of separate virtual tables, so there is nothing to create
    here. ``session`` is accepted and ignored.
    """
    return None
def create_image_tables(session: Session) -> None:
    """Do nothing; with PostgreSQL, tables come from Alembic migrations.

    The image tables are described by the PropertyImage and ImageFeatureTag
    models in this module. ``session`` is accepted and ignored.
    """
    return None