Coverage for src / idx_api / embeddings / base.py: 65%

68 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2025-12-28 11:12 -0700

1"""Core embedding functions for PostgreSQL with pgvector. 

2 

3Provides app-level embedding generation via the Ollama API, image-related ORM models with pgvector columns, and pgvector index creation. 

4""" 

5 

6import httpx 

7import numpy as np 

8from pgvector.sqlalchemy import Vector 

9from sqlalchemy import Column, DateTime, ForeignKey, Index, Integer, String, Text, text 

10from sqlalchemy.orm import Session 

11 

12from idx_api.config import settings 

13from idx_api.models.base import Base 

14 

15 

16# SQLAlchemy Models for image-related tables with pgvector embeddings 

17 

18 

class PropertyImage(Base):
    """Metadata, description, and embeddings for a single property photo.

    Maps to the ``property_images`` table. Both embedding vectors live on
    the row itself as pgvector columns rather than in a separate table.
    """

    __tablename__ = "property_images"

    # Surrogate key plus the listing this image belongs to.
    id = Column(Integer, primary_key=True, autoincrement=True)
    listing_id = Column(String(50), nullable=False, index=True)

    # Required image location and index (presumably the photo's position
    # within the listing -- confirm against the ingestion code).
    image_url = Column(Text, nullable=False)
    image_index = Column(Integer, nullable=False)

    # Nullable descriptive fields.
    description = Column(Text)
    room_type = Column(String(50))

    # Populated by the database via CURRENT_TIMESTAMP, not by the app.
    created_at = Column(DateTime, server_default=text("CURRENT_TIMESTAMP"))

    # 1024-dimensional pgvector columns stored directly on this model.
    description_embedding = Column(Vector(1024))  # mxbai-embed-large
    visual_embedding = Column(Vector(1024))  # SigLIP

    # The same URL may be stored only once per listing.
    __table_args__ = (
        Index(
            "ix_property_images_listing_url",
            "listing_id",
            "image_url",
            unique=True,
        ),
    )

39 

40 

class ImageFeatureTag(Base):
    """A structured (type, value) feature tag attached to a property image.

    Maps to the ``image_feature_tags`` table; the tags are used for
    "inverse footnotes".
    """

    __tablename__ = "image_feature_tags"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # Owning row in ``property_images``.
    image_id = Column(Integer, ForeignKey("property_images.id"), nullable=False)

    # The tag itself; both parts are required.
    feature_type = Column(String(50), nullable=False)
    feature_value = Column(String(255), nullable=False)

    __table_args__ = (
        # Lookup by value alone.
        Index("ix_image_feature_tags_value", "feature_value"),
        # Lookup by type and value together.
        Index(
            "ix_image_feature_tags_type_value",
            "feature_type",
            "feature_value",
        ),
        # An image may carry a given (type, value) pair at most once.
        Index(
            "ix_image_feature_tags_unique",
            "image_id",
            "feature_type",
            "feature_value",
            unique=True,
        ),
    )

56 

57 

58async def generate_embedding(text_content: str) -> list[float] | None: 

59 """Generate embedding via Ollama API (app-level, not in-SQL). 

60 

61 Args: 

62 text_content: Text to embed 

63 

64 Returns: 

65 List of floats representing the embedding, or None on error 

66 """ 

67 if not text_content or not text_content.strip(): 

68 return None 

69 

70 try: 

71 async with httpx.AsyncClient(timeout=60.0) as client: 

72 response = await client.post( 

73 f"{settings.ollama_base_url}/api/embeddings", 

74 json={ 

75 "model": settings.ollama_embed_model, 

76 "prompt": text_content, 

77 }, 

78 ) 

79 response.raise_for_status() 

80 data = response.json() 

81 return data.get("embedding") 

82 except Exception as e: 

83 print(f"Error generating embedding: {e}") 

84 return None 

85 

86 

87def generate_embedding_sync(text_content: str) -> list[float] | None: 

88 """Synchronous version of embedding generation for non-async contexts. 

89 

90 Args: 

91 text_content: Text to embed 

92 

93 Returns: 

94 List of floats representing the embedding, or None on error 

95 """ 

96 if not text_content or not text_content.strip(): 

97 return None 

98 

99 try: 

100 with httpx.Client(timeout=60.0) as client: 

101 response = client.post( 

102 f"{settings.ollama_base_url}/api/embeddings", 

103 json={ 

104 "model": settings.ollama_embed_model, 

105 "prompt": text_content, 

106 }, 

107 ) 

108 response.raise_for_status() 

109 data = response.json() 

110 return data.get("embedding") 

111 except Exception as e: 

112 print(f"Error generating embedding: {e}") 

113 return None 

114 

115 

def embedding_to_pgvector(embedding: list[float]) -> str:
    """Render an embedding as a pgvector text literal.

    Args:
        embedding: Values to serialize, in order.

    Returns:
        Each value's ``str()`` form, comma-separated with no spaces and
        wrapped in square brackets, e.g. ``'[0.1,0.2,0.3]'``.
    """
    components = map(str, embedding)
    return f"[{','.join(components)}]"

126 

127 

def create_vector_indexes(session: Session, *, lists: int = 100) -> None:
    """Create pgvector IVFFlat indexes for fast similarity search.

    Enables the ``vector`` extension if needed, then creates one cosine
    distance IVFFlat index per embedding column (properties, agents,
    brokerages, and both ``property_images`` embeddings), and commits.
    Every statement uses ``IF NOT EXISTS``, so calling this again is safe.

    IVFFlat partitions vectors into ``lists`` clusters for approximate
    nearest-neighbour search. The pgvector documentation suggests starting
    at ``rows / 1000`` for up to 1M rows (``sqrt(rows)`` beyond that), so
    the default of 100 suits tables of roughly 100K rows. It also
    recommends building the index once the table already holds data.

    Args:
        session: Session used to run the DDL; it is committed at the end.
        lists: Number of IVFFlat lists for every index. Defaults to 100,
            the value this function always used.

    Raises:
        ValueError: If ``lists`` is not a positive integer.
    """
    # ``lists`` is interpolated into DDL (index storage parameters cannot be
    # bound), so coerce it to a plain int before it touches the SQL string.
    lists = int(lists)
    if lists < 1:
        raise ValueError(f"lists must be a positive integer, got {lists}")

    # (index name, table, vector column) for every index to create.
    index_specs = (
        ("idx_properties_embedding", "properties", "embedding"),
        ("idx_agents_embedding", "agents", "embedding"),
        ("idx_brokerages_embedding", "brokerages", "embedding"),
        (
            "idx_property_images_description_embedding",
            "property_images",
            "description_embedding",
        ),
        (
            "idx_property_images_visual_embedding",
            "property_images",
            "visual_embedding",
        ),
    )

    # Enable pgvector extension
    session.execute(text("CREATE EXTENSION IF NOT EXISTS vector"))

    for index_name, table_name, column_name in index_specs:
        session.execute(
            text(
                f"CREATE INDEX IF NOT EXISTS {index_name} "
                f"ON {table_name} "
                f"USING ivfflat ({column_name} vector_cosine_ops) "
                f"WITH (lists = {lists})"
            )
        )

    session.commit()

183 

184 

185# Legacy compatibility - these were used by SQLite code 

def ensure_embedding_client(session: Session) -> None:
    """Do nothing; retained as a no-op for PostgreSQL.

    Embedding generation now happens at the application level, so there is
    no client to set up. ``session`` is accepted and ignored.
    """
    return None

189 

190 

def init_embedding_client(session: Session) -> None:
    """Do nothing; retained as a no-op for PostgreSQL.

    Embedding generation now happens at the application level, so there is
    nothing to initialise. ``session`` is accepted and ignored.
    """
    return None

194 

195 

def create_vector_tables(session: Session) -> None:
    """Do nothing; on PostgreSQL, Alembic migrations create the tables.

    Vector columns belong to the main tables (properties, agents, etc.)
    instead of separate virtual tables, so there is nothing to create
    here. ``session`` is accepted and ignored.
    """
    return None

203 

204 

def create_image_tables(session: Session) -> None:
    """Do nothing; on PostgreSQL, Alembic migrations create the tables.

    The image tables are described by the PropertyImage and ImageFeatureTag
    models in this module. ``session`` is accepted and ignored.
    """
    return None