Coverage for src / idx_api / embeddings / properties.py: 8%

143 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2025-12-28 11:12 -0700

1"""Property embedding functions for semantic listing search with PostgreSQL/pgvector. 

2 

3Handles property indexing, search, and text representation building. 

4""" 

5 

6import datetime 

7import json 

8 

9from sqlalchemy import text 

10from sqlalchemy.orm import Session 

11 

12from idx_api.embeddings.base import ( 

13 embedding_to_pgvector, 

14 generate_embedding_sync, 

15) 

16 

17 

18def _get_era_description(year_built: int) -> str: 

19 """Get era description for a property based on year built. 

20 

21 Args: 

22 year_built: Year the property was built 

23 

24 Returns: 

25 Era description string 

26 """ 

27 current_year = datetime.datetime.now().year 

28 

29 if year_built >= current_year: 

30 return "new construction" 

31 elif year_built >= current_year - 5: 

32 return "recently built" 

33 elif year_built >= 2000: 

34 return "modern" 

35 elif year_built >= 1980: 

36 return "established" 

37 elif year_built >= 1960: 

38 return "mid-century" 

39 elif year_built >= 1940: 

40 return "vintage" 

41 else: 

42 return "historic" 

43 

44 

45def _parse_json_features(value) -> list[str]: 

46 """Safely parse JSON feature arrays. 

47 

48 Args: 

49 value: JSON string, list, or None 

50 

51 Returns: 

52 List of feature strings 

53 """ 

54 if not value: 

55 return [] 

56 if isinstance(value, list): 

57 return [str(v) for v in value if v] 

58 if isinstance(value, str): 

59 try: 

60 parsed = json.loads(value) 

61 if isinstance(parsed, list): 

62 return [str(v) for v in parsed if v] 

63 except (json.JSONDecodeError, TypeError): 

64 pass 

65 return [] 

66 

67 

def build_property_text(property_data: dict) -> str:
    """Assemble the text that is embedded to represent a property.

    Walks the property record section by section (type, location, address,
    specs, age, stories, lot, garage, feature lists, remarks) and emits one
    text segment per section that has data. Missing or falsy fields are
    skipped silently.

    Args:
        property_data: Property data dict from database

    Returns:
        The segments joined with " | ", for semantic search embedding
    """
    segments = []

    # Type headline: "<subtype> <type>", or "<type> property" without a subtype
    kind = property_data.get("property_type")
    subkind = property_data.get("property_sub_type")
    if kind:
        segments.append(f"{subkind} {kind}" if subkind else f"{kind} property")

    # Location: city, county, state - whichever are present
    county = property_data.get("county_or_parish")
    locality = [
        place
        for place in (
            property_data.get("city"),
            f"{county} County" if county else None,
            property_data.get("state_or_province"),
        )
        if place
    ]
    if locality:
        segments.append(", ".join(locality))

    # Street address so searches for a specific address can match
    street = " ".join(
        piece
        for piece in (
            property_data.get("street_number"),
            property_data.get("street_name"),
            property_data.get("street_suffix"),
        )
        if piece
    )
    if street:
        segments.append(street)

    # Bedrooms, pluralised naturally
    bed_count = property_data.get("bedrooms_total")
    if bed_count:
        plural = "s" if bed_count > 1 else ""
        segments.append(f"{bed_count} bedroom{plural}")

    # Bathrooms, with the full/half split only when both are known
    bath_count = property_data.get("bathrooms_total_integer")
    if bath_count:
        plural = "s" if bath_count > 1 else ""
        bath_text = f"{bath_count} bathroom{plural}"
        full_baths = property_data.get("bathrooms_full")
        half_baths = property_data.get("bathrooms_half")
        if full_baths and half_baths:
            bath_text = f"{bath_text} ({full_baths} full, {half_baths} half)"
        segments.append(bath_text)

    # Living area
    living_area = property_data.get("living_area")
    if living_area:
        segments.append(f"{int(living_area):,} square feet")

    # Build year plus its era label
    built = property_data.get("year_built")
    if built:
        segments.append(f"built {built} ({_get_era_description(built)})")

    # Stories; values that are not 1, 2 or >= 3 (e.g. 1.5) add nothing
    stories = property_data.get("stories")
    if stories == 1:
        segments.append("single story ranch")
    elif stories == 2:
        segments.append("two story")
    elif stories and stories >= 3:
        segments.append(f"{stories} stories")

    # Lot size: shown as acres from 1 upward, otherwise multiplied by 43560
    # (square feet per acre) and shown as sq ft
    lot = property_data.get("lot_size_area")
    if lot:
        if lot >= 1:
            segments.append(f"{lot:.2f} acre lot")
        else:
            segments.append(f"{int(lot * 43560):,} sq ft lot")

    # Garage
    garage_spaces = property_data.get("garage_spaces")
    if garage_spaces:
        segments.append(f"{garage_spaces} car garage")

    # Feature lists, emitted in this fixed order as "<label>: a, b, c"
    feature_sections = (
        ("heating", "heating"),
        ("cooling", "cooling"),
        ("style", "architectural_style"),
        ("appliances", "appliances"),
        ("interior", "interior_features"),
        ("exterior", "exterior_features"),
    )
    for label, field in feature_sections:
        features = _parse_json_features(property_data.get(field))
        if features:
            joined = ", ".join(features)
            segments.append(f"{label}: {joined}")

    # Free-text remarks, capped at 1200 characters plus an ellipsis marker
    remarks = property_data.get("public_remarks")
    if remarks:
        clipped = remarks if len(remarks) <= 1200 else remarks[:1200] + "..."
        segments.append(clipped)

    return " | ".join(segments)

202 

203 

def index_property(session: Session, listing_id: str, text_content: str) -> None:
    """Index a property for vector search using pgvector.

    Generates an embedding for ``text_content`` and writes it to the
    ``embedding`` column of the matching ``properties`` row. The statement is
    executed on ``session`` but not committed here; the caller owns the
    transaction.

    If embedding generation returns ``None`` a message is printed and the row
    is left untouched. No exception is raised in that case.

    Args:
        session: Database session
        listing_id: MLS listing ID
        text_content: Text representation of the property
    """
    embedding = generate_embedding_sync(text_content)

    if embedding is None:
        print(f"Failed to generate embedding for {listing_id}")
        return

    # Use CAST(... AS vector) rather than the PostgreSQL "::vector" shorthand:
    # inside sqlalchemy.text() a bind name immediately followed by ":" is not
    # recognised as a bind parameter, so ":embedding::vector" never gets bound
    # correctly.
    session.execute(
        text("""
            UPDATE properties
            SET embedding = CAST(:embedding AS vector)
            WHERE listing_id = :listing_id
        """),
        {
            "listing_id": listing_id,
            "embedding": embedding_to_pgvector(embedding),
        },
    )

231 

232 

def search_properties(
    session: Session,
    query: str,
    limit: int = 10,
) -> list[dict]:
    """Search properties by semantic similarity using pgvector.

    Embeds ``query``, then orders every property that has a stored embedding
    by its pgvector ``<=>`` distance to the query embedding and returns the
    closest ``limit`` rows.

    Args:
        session: Database session
        query: Search query (e.g., "3 bedroom with pool near downtown")
        limit: Maximum results to return

    Returns:
        List of dicts with keys ``listing_id``, ``address``, ``city``,
        ``state``, ``postal_code``, ``price``, ``beds``, ``baths``, ``sqft``,
        ``property_type``, ``status``, ``photo_url`` and ``similarity``
        (``1.0 - distance``). Empty list if the query could not be embedded.
    """
    query_embedding = generate_embedding_sync(query)

    if query_embedding is None:
        return []

    # Use CAST(... AS vector) rather than the PostgreSQL "::vector" shorthand:
    # inside sqlalchemy.text() a bind name immediately followed by ":" is not
    # recognised as a bind parameter, so ":query_embedding::vector" never gets
    # bound correctly.
    result = session.execute(
        text("""
            SELECT
                p.listing_id,
                p.street_number,
                p.street_name,
                p.street_suffix,
                p.city,
                p.state_or_province,
                p.postal_code,
                p.list_price,
                p.bedrooms_total,
                p.bathrooms_total_integer,
                p.living_area,
                p.property_type,
                p.standard_status,
                p.primary_photo_url,
                p.embedding <=> CAST(:query_embedding AS vector) AS similarity_distance
            FROM properties p
            WHERE p.embedding IS NOT NULL
            ORDER BY p.embedding <=> CAST(:query_embedding AS vector)
            LIMIT :limit
        """),
        {
            "query_embedding": embedding_to_pgvector(query_embedding),
            "limit": limit,
        },
    )

    return [
        {
            "listing_id": row.listing_id,
            # Join only the address components that are present
            "address": " ".join(
                filter(None, [row.street_number, row.street_name, row.street_suffix])
            ),
            "city": row.city,
            "state": row.state_or_province,
            "postal_code": row.postal_code,
            "price": row.list_price,
            "beds": row.bedrooms_total,
            "baths": row.bathrooms_total_integer,
            "sqft": row.living_area,
            "property_type": row.property_type,
            "status": row.standard_status,
            "photo_url": row.primary_photo_url,
            # Convert distance (smaller is closer) to a similarity score
            "similarity": 1.0 - row.similarity_distance,
        }
        for row in result.fetchall()
    ]

301 

302 

def reindex_all_properties(session: Session, limit: int | None = None) -> int:
    """Reindex all properties for vector search.

    Selects every row of ``properties`` with ``mlg_can_view = true`` and
    ``standard_status = 'Active'``, rebuilds its text representation and
    re-embeds it. Work is committed every 50 rows and once more at the end.

    Args:
        session: Database session
        limit: Optional limit on number of properties to index (for testing).
            ``None`` and ``0`` both mean "no limit".

    Returns:
        Number of properties processed. ``index_property`` returns nothing, so
        rows whose embedding could not be generated are still counted.
    """
    sql = """
        SELECT *
        FROM properties
        WHERE mlg_can_view = true
        AND standard_status = 'Active'
    """
    params = {}
    if limit:
        # Bind the limit instead of formatting it into the SQL text
        sql += " LIMIT :limit"
        params["limit"] = limit

    result = session.execute(text(sql), params)

    count = 0
    for count, row in enumerate(result.fetchall(), start=1):
        row_dict = row._asdict()
        text_content = build_property_text(row_dict)
        index_property(session, row_dict["listing_id"], text_content)
        # Commit periodically to avoid holding locks too long
        if count % 50 == 0:
            session.commit()

    # Flush whatever is left from the final partial batch
    session.commit()
    return count