Coverage for src / idx_api / embeddings / images.py: 14%
104 statements
« prev ^ index » next coverage.py v7.13.1, created at 2025-12-28 11:16 -0700
« prev ^ index » next coverage.py v7.13.1, created at 2025-12-28 11:16 -0700
1"""Property image embedding functions for multi-modal search with PostgreSQL/pgvector.
3Handles image indexing, description embeddings, visual embeddings,
4and inverse footnote linking.
5"""
7import json
8import re
10from sqlalchemy import text
11from sqlalchemy.orm import Session
13from idx_api.config import settings
14from idx_api.embeddings.base import embedding_to_pgvector, generate_embedding_sync
17def index_property_image(
18 session: Session,
19 listing_id: str,
20 image_url: str,
21 image_index: int,
22 description: str,
23 room_type: str | None = None,
24) -> int:
25 """Index a single property image with its description.
27 Args:
28 session: Database session
29 listing_id: MLS listing ID
30 image_url: URL of the image
31 image_index: Position in the photo array (0 = primary)
32 description: AI-generated description of the image
33 room_type: Detected room type (kitchen, bathroom, etc.)
35 Returns:
36 ID of the inserted/updated property_images row
37 """
38 # Generate description embedding via Ollama API
39 description_embedding = generate_embedding_sync(description) if description else None
41 # Insert or update the image metadata with embedding
42 session.execute(
43 text("""
44 INSERT INTO property_images (listing_id, image_url, image_index, description, room_type, description_embedding)
45 VALUES (:listing_id, :image_url, :image_index, :description, :room_type, :embedding::vector)
46 ON CONFLICT(listing_id, image_url) DO UPDATE SET
47 description = EXCLUDED.description,
48 room_type = EXCLUDED.room_type,
49 description_embedding = EXCLUDED.description_embedding,
50 created_at = CURRENT_TIMESTAMP
51 """),
52 {
53 "listing_id": listing_id,
54 "image_url": image_url,
55 "image_index": image_index,
56 "description": description,
57 "room_type": room_type,
58 "embedding": embedding_to_pgvector(description_embedding) if description_embedding else None,
59 },
60 )
62 # Get the image ID
63 result = session.execute(
64 text("SELECT id FROM property_images WHERE listing_id = :listing_id AND image_url = :image_url"),
65 {"listing_id": listing_id, "image_url": image_url},
66 )
67 row = result.fetchone()
68 return row[0]
71def index_property_image_visual(
72 session: Session,
73 image_id: int,
74 visual_embedding: list[float],
75) -> None:
76 """Store a SigLIP visual embedding for a property image (Approach B).
78 Args:
79 session: Database session
80 image_id: ID from property_images table
81 visual_embedding: 1024-dimensional SigLIP embedding
82 """
83 session.execute(
84 text("""
85 UPDATE property_images
86 SET visual_embedding = :embedding::vector
87 WHERE id = :image_id
88 """),
89 {
90 "image_id": image_id,
91 "embedding": embedding_to_pgvector(visual_embedding),
92 },
93 )
96def search_image_descriptions(
97 session: Session,
98 query: str,
99 limit: int = 20,
100) -> list[dict]:
101 """Search property images by description similarity (Approach A).
103 Args:
104 session: Database session
105 query: Search query (e.g., "granite countertops")
106 limit: Maximum results to return
108 Returns:
109 List of matching images with their properties
110 """
111 # Generate query embedding
112 query_embedding = generate_embedding_sync(query)
114 if query_embedding is None:
115 return []
117 result = session.execute(
118 text("""
119 SELECT
120 i.id as image_id,
121 i.listing_id,
122 i.image_url,
123 i.image_index,
124 i.description,
125 i.room_type,
126 p.city,
127 p.state_or_province,
128 p.list_price,
129 p.bedrooms_total,
130 p.bathrooms_total_integer,
131 i.description_embedding <=> :query_embedding::vector AS similarity_distance
132 FROM property_images i
133 JOIN properties p ON p.listing_id = i.listing_id
134 WHERE i.description_embedding IS NOT NULL
135 AND p.mlg_can_view = true
136 ORDER BY i.description_embedding <=> :query_embedding::vector
137 LIMIT :limit
138 """),
139 {
140 "query_embedding": embedding_to_pgvector(query_embedding),
141 "limit": limit,
142 },
143 )
145 return [
146 {
147 "image_id": row.image_id,
148 "listing_id": row.listing_id,
149 "image_url": row.image_url,
150 "image_index": row.image_index,
151 "description": row.description,
152 "room_type": row.room_type,
153 "city": row.city,
154 "state": row.state_or_province,
155 "price": row.list_price,
156 "beds": row.bedrooms_total,
157 "baths": row.bathrooms_total_integer,
158 "similarity": 1.0 - row.similarity_distance,
159 }
160 for row in result.fetchall()
161 ]
164def search_image_visuals(
165 session: Session,
166 query_embedding: list[float],
167 limit: int = 20,
168) -> list[dict]:
169 """Search property images by visual similarity (Approach B).
171 Args:
172 session: Database session
173 query_embedding: SigLIP text embedding for the query
174 limit: Maximum results to return
176 Returns:
177 List of matching images with their properties
178 """
179 result = session.execute(
180 text("""
181 SELECT
182 i.id as image_id,
183 i.listing_id,
184 i.image_url,
185 i.image_index,
186 i.description,
187 i.room_type,
188 p.city,
189 p.state_or_province,
190 p.list_price,
191 p.bedrooms_total,
192 p.bathrooms_total_integer,
193 i.visual_embedding <=> :query_embedding::vector AS similarity_distance
194 FROM property_images i
195 JOIN properties p ON p.listing_id = i.listing_id
196 WHERE i.visual_embedding IS NOT NULL
197 AND p.mlg_can_view = true
198 ORDER BY i.visual_embedding <=> :query_embedding::vector
199 LIMIT :limit
200 """),
201 {
202 "query_embedding": embedding_to_pgvector(query_embedding),
203 "limit": limit,
204 },
205 )
207 return [
208 {
209 "image_id": row.image_id,
210 "listing_id": row.listing_id,
211 "image_url": row.image_url,
212 "image_index": row.image_index,
213 "description": row.description,
214 "room_type": row.room_type,
215 "city": row.city,
216 "state": row.state_or_province,
217 "price": row.list_price,
218 "beds": row.bedrooms_total,
219 "baths": row.bathrooms_total_integer,
220 "similarity": 1.0 - row.similarity_distance,
221 }
222 for row in result.fetchall()
223 ]
226def get_unindexed_property_images(
227 session: Session,
228 limit: int = 100,
229) -> list[dict]:
230 """Get property images that haven't been indexed yet.
232 Returns properties with photos but no entries in property_images table.
234 Args:
235 session: Database session
236 limit: Maximum properties to return
238 Returns:
239 List of dicts with listing_id and photos array
240 """
241 result = session.execute(
242 text("""
243 SELECT p.listing_id, p.photos, p.primary_photo_url
244 FROM properties p
245 LEFT JOIN property_images pi ON pi.listing_id = p.listing_id
246 WHERE p.mlg_can_view = true
247 AND p.standard_status = 'Active'
248 AND p.photos IS NOT NULL
249 AND pi.id IS NULL
250 LIMIT :limit
251 """),
252 {"limit": limit},
253 )
255 properties = []
256 for row in result.fetchall():
257 photos = []
258 if row.photos:
259 try:
260 photos = json.loads(row.photos)
261 except json.JSONDecodeError:
262 pass
263 elif row.primary_photo_url:
264 photos = [row.primary_photo_url]
266 if photos:
267 properties.append({
268 "listing_id": row.listing_id,
269 "photos": photos[:settings.vision_max_photos_per_property],
270 })
272 return properties
275def get_image_index_stats(session: Session) -> dict:
276 """Get statistics about image indexing progress.
278 Returns:
279 Dict with counts for total properties, indexed images, etc.
280 """
281 result = session.execute(
282 text("""
283 SELECT
284 (SELECT COUNT(*) FROM properties WHERE mlg_can_view = true AND standard_status = 'Active') as total_properties,
285 (SELECT COUNT(DISTINCT listing_id) FROM property_images) as properties_with_images,
286 (SELECT COUNT(*) FROM property_images) as total_images,
287 (SELECT COUNT(*) FROM property_images WHERE description_embedding IS NOT NULL) as description_embeddings,
288 (SELECT COUNT(*) FROM property_images WHERE visual_embedding IS NOT NULL) as visual_embeddings
289 """)
290 )
291 row = result.fetchone()
292 return {
293 "total_properties": row[0],
294 "properties_with_images": row[1],
295 "total_images": row[2],
296 "description_embeddings": row[3],
297 "visual_embeddings": row[4],
298 }
301# ========================================
302# Feature Tags (for inverse footnotes)
303# ========================================
306def index_image_feature_tags(
307 session: Session,
308 image_id: int,
309 tags: dict,
310) -> int:
311 """Index structured feature tags for an image.
313 These tags enable "inverse footnote" linking from listing text to photos.
315 Args:
316 session: Database session
317 image_id: ID from property_images table
318 tags: Dict with feature_type → list of values, e.g.:
319 {"room_type": ["kitchen"], "features": ["island", "granite countertops"],
320 "materials": ["granite", "stainless steel"], "style": ["modern"]}
322 Returns:
323 Number of tags indexed
324 """
325 count = 0
326 for feature_type, values in tags.items():
327 if not values:
328 continue
329 # Handle both single values and lists
330 if isinstance(values, str):
331 values = [values]
333 for value in values:
334 if not value:
335 continue
336 # Normalize the value (lowercase, strip whitespace)
337 normalized_value = value.lower().strip()
338 try:
339 session.execute(
340 text("""
341 INSERT INTO image_feature_tags (image_id, feature_type, feature_value)
342 VALUES (:image_id, :feature_type, :feature_value)
343 ON CONFLICT (image_id, feature_type, feature_value) DO NOTHING
344 """),
345 {
346 "image_id": image_id,
347 "feature_type": feature_type,
348 "feature_value": normalized_value,
349 },
350 )
351 count += 1
352 except Exception:
353 pass # Ignore duplicate constraint violations
355 return count
358def find_images_by_feature(
359 session: Session,
360 listing_id: str,
361 feature_terms: list[str],
362) -> list[dict]:
363 """Find images for a listing that match given feature terms.
365 This is the core of "inverse footnotes" - given terms from listing text,
366 find which photos show those features.
368 Args:
369 session: Database session
370 listing_id: MLS listing ID to search within
371 feature_terms: List of terms to search for (e.g., ["shop", "tub", "pool"])
373 Returns:
374 List of matching images with their matching terms
375 """
376 if not feature_terms:
377 return []
379 # Build a query that finds images matching any of the terms
380 # Uses LIKE for partial matching (e.g., "shop" matches "workshop")
381 conditions = []
382 params = {"listing_id": listing_id}
384 for i, term in enumerate(feature_terms):
385 normalized = term.lower().strip()
386 param_name = f"term_{i}"
387 params[param_name] = f"%{normalized}%"
388 conditions.append(f"ft.feature_value LIKE :{param_name}")
390 if not conditions:
391 return []
393 where_clause = " OR ".join(conditions)
395 result = session.execute(
396 text(f"""
397 SELECT DISTINCT
398 i.id as image_id,
399 i.image_url,
400 i.image_index,
401 i.description,
402 i.room_type,
403 STRING_AGG(DISTINCT ft.feature_value, ',') as matched_features
404 FROM property_images i
405 JOIN image_feature_tags ft ON ft.image_id = i.id
406 WHERE i.listing_id = :listing_id
407 AND ({where_clause})
408 GROUP BY i.id, i.image_url, i.image_index, i.description, i.room_type
409 ORDER BY i.image_index
410 """),
411 params,
412 )
414 return [
415 {
416 "image_id": row.image_id,
417 "image_url": row.image_url,
418 "image_index": row.image_index,
419 "description": row.description,
420 "room_type": row.room_type,
421 "matched_features": row.matched_features.split(",") if row.matched_features else [],
422 }
423 for row in result.fetchall()
424 ]
427def get_inverse_footnotes(
428 session: Session,
429 listing_id: str,
430 listing_text: str,
431 min_term_length: int = 3,
432) -> dict:
433 """Generate inverse footnotes linking terms in listing text to photos.
435 Scans the listing text for known feature terms and returns a mapping
436 of terms to their corresponding image evidence.
438 Args:
439 session: Database session
440 listing_id: MLS listing ID
441 listing_text: The public remarks or description text
442 min_term_length: Minimum term length to consider (filters noise)
444 Returns:
445 Dict with:
446 - "terms": Dict mapping term → list of image_ids
447 - "images": Dict mapping image_id → image metadata
448 - "annotated_text": Original text with [[term:image_ids]] markers
449 """
450 # Get all feature tags for this listing's images
451 result = session.execute(
452 text("""
453 SELECT
454 i.id as image_id,
455 i.image_url,
456 i.image_index,
457 i.room_type,
458 ft.feature_type,
459 ft.feature_value
460 FROM property_images i
461 JOIN image_feature_tags ft ON ft.image_id = i.id
462 WHERE i.listing_id = :listing_id
463 """),
464 {"listing_id": listing_id},
465 )
467 # Build a mapping of feature terms to images
468 term_to_images = {} # term → set of image_ids
469 image_metadata = {} # image_id → image info
471 for row in result.fetchall():
472 term = row.feature_value
473 image_id = row.image_id
475 if term not in term_to_images:
476 term_to_images[term] = set()
477 term_to_images[term].add(image_id)
479 if image_id not in image_metadata:
480 image_metadata[image_id] = {
481 "image_id": image_id,
482 "image_url": row.image_url,
483 "image_index": row.image_index,
484 "room_type": row.room_type,
485 }
487 # Find terms that appear in the listing text
488 listing_lower = listing_text.lower()
489 found_terms = {} # term → list of image_ids (sorted)
491 for term, image_ids in term_to_images.items():
492 if len(term) < min_term_length:
493 continue
495 # Check if term appears in text (word boundary aware for longer terms)
496 if len(term) >= 4:
497 # Use word boundary for longer terms
498 pattern = r'\b' + re.escape(term) + r'\b'
499 if re.search(pattern, listing_lower):
500 found_terms[term] = sorted(image_ids)
501 elif term in listing_lower:
502 # For short terms, require exact substring
503 found_terms[term] = sorted(image_ids)
505 # Generate annotated text with inverse footnote markers
506 annotated = listing_text
507 for term in sorted(found_terms.keys(), key=len, reverse=True):
508 # Find all occurrences (case-insensitive)
509 image_ids = found_terms[term]
510 marker = f"[[{term}:{','.join(map(str, image_ids))}]]"
512 # Replace first occurrence only to avoid over-annotation
513 pattern = re.compile(re.escape(term), re.IGNORECASE)
514 annotated = pattern.sub(f"{term}{marker}", annotated, count=1)
516 return {
517 "terms": found_terms,
518 "images": image_metadata,
519 "annotated_text": annotated,
520 "total_terms_found": len(found_terms),
521 "total_images_linked": len(set().union(*found_terms.values())) if found_terms else 0,
522 }