Coverage for src / idx_api / embeddings / properties.py: 8%
143 statements
« prev ^ index » next coverage.py v7.13.1, created at 2025-12-28 11:12 -0700
« prev ^ index » next coverage.py v7.13.1, created at 2025-12-28 11:12 -0700
1"""Property embedding functions for semantic listing search with PostgreSQL/pgvector.
3Handles property indexing, search, and text representation building.
4"""
6import datetime
7import json
9from sqlalchemy import text
10from sqlalchemy.orm import Session
12from idx_api.embeddings.base import (
13 embedding_to_pgvector,
14 generate_embedding_sync,
15)
def _get_era_description(year_built: int) -> str:
    """Classify a construction year into a buyer-friendly era label.

    Args:
        year_built: Year the property was built

    Returns:
        One of "new construction", "recently built", "modern", "established",
        "mid-century", "vintage" or "historic".
    """
    this_year = datetime.datetime.now().year

    # Lower bounds, checked from newest to oldest; the first bound the year
    # reaches decides the label.
    era_floors = (
        (this_year, "new construction"),
        (this_year - 5, "recently built"),
        (2000, "modern"),
        (1980, "established"),
        (1960, "mid-century"),
        (1940, "vintage"),
    )
    for floor, label in era_floors:
        if year_built >= floor:
            return label

    # Anything older than every bound above.
    return "historic"
def _parse_json_features(value) -> list[str]:
    """Normalise a feature column into a list of non-empty strings.

    Args:
        value: JSON string, list, or None

    Returns:
        The truthy entries of the list, each converted with ``str``. Anything
        that is not a list (or a JSON string encoding one) yields ``[]``.
    """
    candidate = value

    # A non-empty string is expected to hold a JSON array; decode it first so
    # both input shapes share the filtering step below.
    if isinstance(candidate, str) and candidate:
        try:
            candidate = json.loads(candidate)
        except (json.JSONDecodeError, TypeError):
            # Best effort: unparseable text contributes no features.
            return []

    if isinstance(candidate, list):
        return [str(item) for item in candidate if item]
    return []
# JSON feature columns paired with the label each one gets in the embedding
# text, listed in the order they are emitted.
_PROPERTY_FEATURE_SECTIONS = (
    ("heating", "heating"),
    ("cooling", "cooling"),
    ("architectural_style", "style"),
    # Buyers search for things like "stainless steel", "gas range".
    ("appliances", "appliances"),
    # ... "fireplace", "hardwood floors".
    ("interior_features", "interior"),
    # ... "pool", "deck", "patio".
    ("exterior_features", "exterior"),
)

# Remarks longer than this are cut and suffixed with "...".
_PROPERTY_MAX_REMARKS_CHARS = 1200

_PROPERTY_SQFT_PER_ACRE = 43560


def _property_identity_parts(property_data: dict) -> list[str]:
    """Return the property-type, location and street-address fragments."""
    parts = []

    # Property type and subtype; a subtype without a type is ignored.
    prop_type = property_data.get("property_type")
    prop_subtype = property_data.get("property_sub_type")
    if prop_type:
        if prop_subtype:
            parts.append(f"{prop_subtype} {prop_type}")
        else:
            parts.append(f"{prop_type} property")

    # Location with county context.
    city = property_data.get("city")
    county = property_data.get("county_or_parish")
    state = property_data.get("state_or_province")
    location_parts = []
    if city:
        location_parts.append(city)
    if county:
        location_parts.append(f"{county} County")
    if state:
        location_parts.append(state)
    if location_parts:
        parts.append(", ".join(location_parts))

    # Address for specific searches. Each piece is passed through str() so a
    # numeric street number cannot make str.join raise TypeError.
    address_parts = (
        property_data.get("street_number"),
        property_data.get("street_name"),
        property_data.get("street_suffix"),
    )
    address = " ".join(str(piece) for piece in address_parts if piece)
    if address:
        parts.append(address)

    return parts


def _property_spec_parts(property_data: dict) -> list[str]:
    """Return the bedroom, bathroom and living-area fragments."""
    parts = []

    beds = property_data.get("bedrooms_total")
    if beds:
        parts.append(f"{beds} bedroom" + ("s" if beds > 1 else ""))

    baths = property_data.get("bathrooms_total_integer")
    if baths:
        bath_detail = f"{baths} bathroom" + ("s" if baths > 1 else "")
        baths_full = property_data.get("bathrooms_full")
        baths_half = property_data.get("bathrooms_half")
        # The full/half breakdown is only added when both counts are non-zero.
        if baths_full and baths_half:
            bath_detail += f" ({baths_full} full, {baths_half} half)"
        parts.append(bath_detail)

    sqft = property_data.get("living_area")
    if sqft:
        parts.append(f"{int(sqft):,} square feet")

    return parts


def _property_structure_parts(property_data: dict) -> list[str]:
    """Return the year-built, stories, lot-size and garage fragments."""
    parts = []

    # Year built with era context.
    year_built = property_data.get("year_built")
    if year_built:
        era = _get_era_description(year_built)
        parts.append(f"built {year_built} ({era})")

    # Stories, so single-story searches match.
    stories = property_data.get("stories")
    if stories:
        if stories == 1:
            parts.append("single story ranch")
        elif stories == 2:
            parts.append("two story")
        elif stories > 1:
            # Covers 3+ as well as fractional counts such as 1.5 or 2.5,
            # which would otherwise match no branch and be dropped.
            parts.append(f"{stories} stories")

    # Lot size for acreage/land searches.
    # NOTE(review): the sq ft conversion assumes lot_size_area is in acres -
    # confirm against the column's units.
    lot_size = property_data.get("lot_size_area")
    if lot_size:
        if lot_size >= 1:
            parts.append(f"{lot_size:.2f} acre lot")
        else:
            sqft_lot = lot_size * _PROPERTY_SQFT_PER_ACRE
            parts.append(f"{int(sqft_lot):,} sq ft lot")

    garage = property_data.get("garage_spaces")
    if garage:
        parts.append(f"{garage} car garage")

    return parts


def _property_feature_parts(property_data: dict) -> list[str]:
    """Return one "label: a, b, c" fragment per non-empty feature column."""
    parts = []
    for column, label in _PROPERTY_FEATURE_SECTIONS:
        features = _parse_json_features(property_data.get(column))
        if features:
            parts.append(f"{label}: {', '.join(features)}")
    return parts


def _property_remarks_parts(property_data: dict) -> list[str]:
    """Return the public remarks, truncated when very long, as 0 or 1 fragment."""
    remarks = property_data.get("public_remarks")
    if not remarks:
        return []
    if len(remarks) > _PROPERTY_MAX_REMARKS_CHARS:
        remarks = remarks[:_PROPERTY_MAX_REMARKS_CHARS] + "..."
    return [remarks]


def build_property_text(property_data: dict) -> str:
    """Build rich text representation of a property for embedding.

    Fragments are emitted in a fixed order - type, location, address, room
    and area specs, year built, stories, lot size, garage, feature lists,
    then public remarks - and joined with " | ". Missing or falsy fields are
    skipped, so an empty dict produces an empty string.

    Args:
        property_data: Property data dict from database

    Returns:
        Text representation for semantic search embedding
    """
    parts = [
        *_property_identity_parts(property_data),
        *_property_spec_parts(property_data),
        *_property_structure_parts(property_data),
        *_property_feature_parts(property_data),
        *_property_remarks_parts(property_data),
    ]
    return " | ".join(parts)
def index_property(session: Session, listing_id: str, text_content: str) -> None:
    """Index a property for vector search using pgvector.

    Generates an embedding for ``text_content`` and writes it to the
    ``embedding`` column of the matching ``properties`` row. The statement is
    executed on ``session`` but not committed; committing is left to the
    caller. If no embedding could be generated, a message is printed and the
    row is left unchanged.

    Args:
        session: Database session
        listing_id: MLS listing ID
        text_content: Text representation of the property
    """
    # Generate the embedding (via the Ollama API, per the original comment).
    embedding = generate_embedding_sync(text_content)

    if embedding is None:
        print(f"Failed to generate embedding for {listing_id}")
        return

    # Store embedding directly in properties table.
    # CAST(... AS vector) is used instead of the PostgreSQL "::vector"
    # shorthand: a "::" directly after a ":name" placeholder collides with
    # SQLAlchemy's text() bind-parameter syntax.
    session.execute(
        text(
            """
            UPDATE properties
            SET embedding = CAST(:embedding AS vector)
            WHERE listing_id = :listing_id
            """
        ),
        {
            "listing_id": listing_id,
            "embedding": embedding_to_pgvector(embedding),
        },
    )
def search_properties(
    session: Session,
    query: str,
    limit: int = 10,
) -> list[dict]:
    """Search properties by semantic similarity using pgvector.

    Embeds ``query`` and returns the rows of ``properties`` (those with a
    non-NULL embedding) closest to it, nearest first.

    Args:
        session: Database session
        query: Search query (e.g., "3 bedroom with pool near downtown")
        limit: Maximum results to return

    Returns:
        List of matching properties with similarity scores. Each dict has the
        keys ``listing_id``, ``address``, ``city``, ``state``,
        ``postal_code``, ``price``, ``beds``, ``baths``, ``sqft``,
        ``property_type``, ``status``, ``photo_url`` and ``similarity``
        (``1.0`` minus the pgvector distance). Returns an empty list when the
        query embedding could not be generated.
    """
    # Generate query embedding
    query_embedding = generate_embedding_sync(query)

    if query_embedding is None:
        return []

    # "<=>" is pgvector's cosine-distance operator. The parameter is cast
    # with CAST(... AS vector) rather than "::vector" because a "::" directly
    # after a ":name" placeholder collides with SQLAlchemy's text()
    # bind-parameter syntax.
    result = session.execute(
        text(
            """
            SELECT
                p.listing_id,
                p.street_number,
                p.street_name,
                p.street_suffix,
                p.city,
                p.state_or_province,
                p.postal_code,
                p.list_price,
                p.bedrooms_total,
                p.bathrooms_total_integer,
                p.living_area,
                p.property_type,
                p.standard_status,
                p.primary_photo_url,
                p.embedding <=> CAST(:query_embedding AS vector) AS similarity_distance
            FROM properties p
            WHERE p.embedding IS NOT NULL
            ORDER BY p.embedding <=> CAST(:query_embedding AS vector)
            LIMIT :limit
            """
        ),
        {
            "query_embedding": embedding_to_pgvector(query_embedding),
            "limit": limit,
        },
    )

    return [
        {
            "listing_id": row.listing_id,
            # Missing address components are skipped rather than rendered.
            "address": " ".join(
                filter(None, [row.street_number, row.street_name, row.street_suffix])
            ),
            "city": row.city,
            "state": row.state_or_province,
            "postal_code": row.postal_code,
            "price": row.list_price,
            "beds": row.bedrooms_total,
            "baths": row.bathrooms_total_integer,
            "sqft": row.living_area,
            "property_type": row.property_type,
            "status": row.standard_status,
            "photo_url": row.primary_photo_url,
            # Convert distance (smaller is closer) into a similarity score.
            "similarity": 1.0 - row.similarity_distance,
        }
        for row in result.fetchall()
    ]
def reindex_all_properties(session: Session, limit: int | None = None) -> int:
    """Reindex all properties for vector search.

    Selects every property with ``mlg_can_view = true`` and
    ``standard_status = 'Active'``, builds its text representation and passes
    it to ``index_property``. Work is committed every 50 rows and once more
    at the end.

    Args:
        session: Database session
        limit: Optional limit on number of properties to index (for testing).
            ``None`` means no limit; ``0`` selects no rows.

    Returns:
        Number of properties processed. Rows whose embedding could not be
        generated are still counted, because ``index_property`` does not
        report failure to its caller.
    """
    query = """
        SELECT *
        FROM properties
        WHERE mlg_can_view = true
        AND standard_status = 'Active'
    """
    params = {}
    # Compare against None so that limit=0 is honoured instead of being
    # treated as "no limit", and bind the value rather than formatting it
    # into the SQL string.
    if limit is not None:
        query += " LIMIT :limit"
        params["limit"] = limit

    result = session.execute(text(query), params)

    processed = 0
    # fetchall() materialises the rows up front, so the periodic commits
    # below do not interfere with iterating the result.
    for processed, row in enumerate(result.fetchall(), start=1):
        row_dict = row._asdict()
        text_content = build_property_text(row_dict)
        index_property(session, row_dict["listing_id"], text_content)
        # Commit periodically to avoid holding locks too long
        if processed % 50 == 0:
            session.commit()

    session.commit()
    return processed