Coverage for src/idx_api/routers/search.py: 8%

1"""Semantic search endpoints using native sqlite-vec embeddings.

3Uses sqlite-vec with rembed() for high-performance vector search directly in SQLite.

4The MATCH operator leverages native vector indexing for fast similarity queries.

5"""

7import time

9from fastapi import APIRouter, Depends, HTTPException

10from sqlalchemy import text

11from sqlalchemy.orm import Session

13from idx_api.database import get_db

14from idx_api.embeddings import ensure_embedding_client

15from idx_api.schemas.property import (

16 PropertyListItem,

17 PropertySearchResponse,

18 SemanticSearchParams,

19)

21router = APIRouter()

24@router.post("/semantic", response_model=PropertySearchResponse)

25async def semantic_search(

26 params: SemanticSearchParams,

27 db: Session = Depends(get_db),

28):

29 """

30 Search properties using natural language with native vector search.

32 Uses sqlite-vec with rembed() for fast, native vector similarity search.

33 This is significantly faster than Python-based similarity calculations

34 as the embedding generation and similarity matching happen in SQLite.

36 Examples:

37 - "modern kitchen with granite countertops"

38 - "family home near good schools"

39 - "cozy cottage with large backyard"

40 - "single story ranch in Ada County"

41 - "new construction with central air"

42 """

43 start_time = time.perf_counter()

45 try:

46 # Ensure embedding client is registered for this session

47 ensure_embedding_client(db)

48 except Exception as e:

49 raise HTTPException(

50 status_code=503,

51 detail=f"Embedding client initialization failed: {str(e)}",

52 )

54 # Build additional filter conditions for property table

55 extra_conditions = []

56 bind_params = {"query": params.query, "k_limit": params.limit * 3} # Fetch more for filtering

58 if params.city:

59 extra_conditions.append("p.city = :city")

60 bind_params["city"] = params.city

62 if params.min_price:

63 extra_conditions.append("p.list_price >= :min_price")

64 bind_params["min_price"] = params.min_price

66 if params.max_price:

67 extra_conditions.append("p.list_price <= :max_price")

68 bind_params["max_price"] = params.max_price

70 if params.min_beds:

71 extra_conditions.append("p.bedrooms_total >= :min_beds")

72 bind_params["min_beds"] = params.min_beds

74 # Build the extra WHERE clause for property filters

75 extra_where = ""

76 if extra_conditions:

77 extra_where = " AND " + " AND ".join(extra_conditions)

79 try:

80 # Native sqlite-vec search using MATCH with rembed()

81 # The embedding is generated on-the-fly and matched against indexed vectors

82 results = db.execute(

83 text(f"""

84 SELECT

85 p.id,

86 p.listing_id,

87 p.street_number,

88 p.street_name,

89 p.street_suffix,

90 p.unit_number,

91 p.city,

92 p.state_or_province,

93 p.postal_code,

94 p.county_or_parish,

95 p.list_price,

96 p.bedrooms_total,

97 p.bathrooms_total_integer,

98 p.bathrooms_full,

99 p.bathrooms_half,

100 p.living_area,

101 p.lot_size_area,

102 p.year_built,

103 p.stories,

104 p.garage_spaces,

105 p.property_type,

106 p.property_sub_type,

107 p.standard_status,

108 p.primary_photo_url,

109 p.photo_count,

110 v.distance as similarity_distance

111 FROM vec_properties v

112 JOIN properties p ON p.listing_id = v.listing_id

113 WHERE v.embedding MATCH rembed('ollama', :query)

114 AND k = :k_limit

115 AND p.mlg_can_view = 1

116 AND p.standard_status = 'Active'

117 {extra_where}

118 ORDER BY v.distance

119 LIMIT :result_limit

120 """),

121 {**bind_params, "result_limit": params.limit},

122 ).mappings().all()

123

124 except Exception as e:

125 raise HTTPException(

126 status_code=500,

127 detail=f"Vector search failed: {str(e)}",

128 )

129

130 if not results:

131 return PropertySearchResponse(

132 items=[],

133 total=0,

134 page=1,

135 page_size=params.limit,

136 total_pages=0,

137 query_time_ms=round((time.perf_counter() - start_time) * 1000, 2),

138 search_type="semantic",

139 )

140

141 items = []

142 for row in results:

143 row_dict = dict(row)

144 # Convert distance to similarity score (lower distance = higher similarity)

145 # For normalized vectors, distance is in range [0, 2], so similarity = 1 - (distance/2)

146 distance = row_dict.pop("similarity_distance", 0)

147 similarity = 1.0 - (distance / 2.0) if distance else 1.0

148

149 item = PropertyListItem.model_validate(row_dict)

150 item.similarity_score = round(similarity, 4)

151 items.append(item)

152

153 query_time = (time.perf_counter() - start_time) * 1000

154

155 return PropertySearchResponse(

156 items=items,

157 total=len(items),

158 page=1,

159 page_size=params.limit,

160 total_pages=1,

161 query_time_ms=round(query_time, 2),

162 search_type="semantic",

163 )

164

165

166@router.post("/multimodal", response_model=PropertySearchResponse)

167async def multimodal_search(

168 params: SemanticSearchParams,

169 db: Session = Depends(get_db),

170):

171 """

172 Multi-modal search combining text, image descriptions, and visual embeddings.

173

174 Fuses results from three search approaches:

175 1. **Text search**: Property text embeddings (address, specs, remarks)

176 2. **Image descriptions (Approach A)**: AI-generated photo descriptions

177 3. **Visual search (Approach B)**: Direct SigLIP image embeddings (when enabled)

178

179 Results are combined using weighted scoring:

180 - 40% text similarity

181 - 30% best matching image description

182 - 30% best matching visual embedding (if available)

183

184 This approach finds properties where visual features in photos match

185 the query even if the listing text doesn't mention them.

186

187 Examples:

188 - "granite countertops" → finds kitchens with visible granite

189 - "mountain views" → finds properties showing mountain views in photos

190 - "modern farmhouse style" → matches both text and visual style cues

191 """

192 start_time = time.perf_counter()

193

194 from idx_api.config import settings

195 from idx_api.embeddings import search_image_descriptions, search_image_visuals

196 from idx_api.vision import get_siglip_text_embedding

197

198 try:

199 ensure_embedding_client(db)

200 except Exception as e:

201 raise HTTPException(

202 status_code=503,

203 detail=f"Embedding client initialization failed: {str(e)}",

204 )

205

206 # Weights for score fusion (configurable via settings)

207 TEXT_WEIGHT = settings.search_weight_text

208 IMAGE_DESC_WEIGHT = settings.search_weight_descriptions

209 IMAGE_VISUAL_WEIGHT = settings.search_weight_visual

210

211 # 1. Text search (existing property embeddings)

212 try:

213 text_results = db.execute(

214 text("""

215 SELECT

216 v.listing_id,

217 v.distance as text_distance

218 FROM vec_properties v

219 JOIN properties p ON p.listing_id = v.listing_id

220 WHERE v.embedding MATCH rembed('ollama', :query)

221 AND k = :limit

222 AND p.mlg_can_view = 1

223 AND p.standard_status = 'Active'

224 ORDER BY v.distance

225 """),

226 {"query": params.query, "limit": params.limit * 5},

227 ).mappings().all()

228 text_scores = {

229 row["listing_id"]: 1.0 - (row["text_distance"] / 2.0)

230 for row in text_results

231 }

232 except Exception:

233 text_scores = {}

234

235 # 2. Image description search (Approach A)

236 try:

237 image_desc_results = search_image_descriptions(

238 db, params.query, limit=params.limit * 5

239 )

240 # Group by listing_id and take best (highest) similarity per listing

241 image_desc_scores: dict[str, float] = {}

242 for img in image_desc_results:

243 listing_id = img["listing_id"]

244 sim = img["similarity"]

245 if listing_id not in image_desc_scores or sim > image_desc_scores[listing_id]:

246 image_desc_scores[listing_id] = sim

247 except Exception:

248 image_desc_scores = {}

249

250 # 3. Visual search (Approach B) - if SigLIP enabled

251 image_visual_scores: dict[str, float] = {}

252 if settings.siglip_enabled:

253 try:

254 query_embedding = await get_siglip_text_embedding(params.query)

255 if query_embedding:

256 visual_results = search_image_visuals(

257 db, query_embedding, limit=params.limit * 5

258 )

259 for img in visual_results:

260 listing_id = img["listing_id"]

261 sim = img["similarity"]

262 if listing_id not in image_visual_scores or sim > image_visual_scores[listing_id]:

263 image_visual_scores[listing_id] = sim

264 except Exception:

265 pass

266

267 # Combine all listing IDs from any search result

268 all_listing_ids = set(text_scores.keys()) | set(image_desc_scores.keys()) | set(image_visual_scores.keys())

269

270 if not all_listing_ids:

271 return PropertySearchResponse(

272 items=[],

273 total=0,

274 page=1,

275 page_size=params.limit,

276 total_pages=0,

277 query_time_ms=round((time.perf_counter() - start_time) * 1000, 2),

278 search_type="multimodal",

279 )

280

281 # Calculate combined scores for each property

282 scored_listings = []

283 for listing_id in all_listing_ids:

284 # Get individual scores (default to 0 if not in results)

285 text_sim = text_scores.get(listing_id, 0.0)

286 desc_sim = image_desc_scores.get(listing_id, 0.0)

287 visual_sim = image_visual_scores.get(listing_id, 0.0)

288

289 # Adjust weights if visual embeddings not available

290 if settings.siglip_enabled and image_visual_scores:

291 # Full multi-modal fusion

292 combined_score = (

293 TEXT_WEIGHT * text_sim +

294 IMAGE_DESC_WEIGHT * desc_sim +

295 IMAGE_VISUAL_WEIGHT * visual_sim

296 )

297 else:

298 # Fall back to text + description only (re-normalize weights)

299 combined_score = (

300 (TEXT_WEIGHT / (TEXT_WEIGHT + IMAGE_DESC_WEIGHT)) * text_sim +

301 (IMAGE_DESC_WEIGHT / (TEXT_WEIGHT + IMAGE_DESC_WEIGHT)) * desc_sim

302 )

303

304 scored_listings.append((combined_score, listing_id, {

305 "text": round(text_sim, 4),

306 "description": round(desc_sim, 4),

307 "visual": round(visual_sim, 4),

308 }))

309

310 # Sort by combined score descending

311 scored_listings.sort(key=lambda x: x[0], reverse=True)

312 top_listings = scored_listings[:params.limit]

313

314 # Fetch full property details for top results

315 if top_listings:

316 listing_ids = [lid for _, lid, _ in top_listings]

317 placeholders = ", ".join([f":lid{i}" for i in range(len(listing_ids))])

318 params_dict = {f"lid{i}": lid for i, lid in enumerate(listing_ids)}

319

320 property_data = db.execute(

321 text(f"""

322 SELECT

323 id, listing_id, street_number, street_name, street_suffix,

324 unit_number, city, state_or_province, postal_code,

325 county_or_parish, list_price, bedrooms_total,

326 bathrooms_total_integer, bathrooms_full, bathrooms_half,

327 living_area, lot_size_area, year_built, stories,

328 garage_spaces, property_type, property_sub_type,

329 standard_status, primary_photo_url, photo_count

330 FROM properties

331 WHERE listing_id IN ({placeholders})

332 """),

333 params_dict,

334 ).mappings().all()

335

336 # Create lookup by listing_id

337 property_lookup = {row["listing_id"]: dict(row) for row in property_data}

338 else:

339 property_lookup = {}

340

341 items = []

342 for combined_score, listing_id, component_scores in top_listings:

343 if listing_id in property_lookup:

344 row_dict = property_lookup[listing_id]

345 item = PropertyListItem.model_validate(row_dict)

346 item.similarity_score = round(combined_score, 4)

347 # Add component scores as extra info (available via model_dump)

348 items.append(item)

349

350 query_time = (time.perf_counter() - start_time) * 1000

351

352 return PropertySearchResponse(

353 items=items,

354 total=len(scored_listings),

355 page=1,

356 page_size=params.limit,

357 total_pages=max(1, len(scored_listings) // params.limit),

358 query_time_ms=round(query_time, 2),

359 search_type="multimodal",

360 )

361

362

363@router.post("/compare")

364async def compare_search_methods(

365 params: SemanticSearchParams,

366 db: Session = Depends(get_db),

367):

368 """

369 A/B comparison of search methods.

370

371 Runs the same query against all three search approaches and returns

372 results side-by-side for quality evaluation:

373 - Text: Property text embeddings (address, specs, remarks)

374 - Descriptions: AI-generated photo descriptions (Approach A)

375 - Visual: SigLIP image embeddings (Approach B)

376 - Fused: Combined weighted results

377

378 Use this to evaluate which approach finds the most relevant results

379 for different query types.

380 """

381 import time

382 from idx_api.config import settings

383 from idx_api.embeddings import search_image_descriptions, search_image_visuals

384 from idx_api.vision import get_siglip_text_embedding

385

386 start_time = time.perf_counter()

387

388 try:

389 ensure_embedding_client(db)

390 except Exception as e:

391 raise HTTPException(

392 status_code=503,

393 detail=f"Embedding client initialization failed: {str(e)}",

394 )

395

396 results = {

397 "query": params.query,

398 "methods": {},

399 "analysis": {},

400 }

401

402 # 1. Text search (property embeddings)

403 text_start = time.perf_counter()

404 try:

405 text_results = db.execute(

406 text("""

407 SELECT

408 p.listing_id,

409 p.city,

410 p.state_or_province,

411 p.list_price,

412 p.bedrooms_total,

413 p.bathrooms_total_integer,

414 p.living_area,

415 p.primary_photo_url,

416 v.distance as similarity_distance

417 FROM vec_properties v

418 JOIN properties p ON p.listing_id = v.listing_id

419 WHERE v.embedding MATCH rembed('ollama', :query)

420 AND k = :limit

421 AND p.mlg_can_view = 1

422 AND p.standard_status = 'Active'

423 ORDER BY v.distance

424 """),

425 {"query": params.query, "limit": params.limit},

426 ).mappings().all()

427

428 results["methods"]["text"] = {

429 "time_ms": round((time.perf_counter() - text_start) * 1000, 2),

430 "count": len(text_results),

431 "items": [

432 {

433 "listing_id": r["listing_id"],

434 "location": f"{r['city']}, {r['state_or_province']}",

435 "price": r["list_price"],

436 "specs": f"{r['bedrooms_total']}bd/{r['bathrooms_total_integer']}ba",

437 "similarity": round(1.0 - r["similarity_distance"] / 2.0, 4),

438 }

439 for r in text_results

440 ],

441 }

442 except Exception as e:

443 results["methods"]["text"] = {"error": str(e), "count": 0, "items": []}

444

445 # 2. Image description search (Approach A)

446 desc_start = time.perf_counter()

447 try:

448 desc_results = search_image_descriptions(db, params.query, limit=params.limit * 3)

449 # Group by listing_id and take best match per property

450 best_by_listing = {}

451 for r in desc_results:

452 lid = r["listing_id"]

453 if lid not in best_by_listing or r["similarity"] > best_by_listing[lid]["similarity"]:

454 best_by_listing[lid] = r

455

456 sorted_desc = sorted(best_by_listing.values(), key=lambda x: x["similarity"], reverse=True)[:params.limit]

457

458 results["methods"]["descriptions"] = {

459 "time_ms": round((time.perf_counter() - desc_start) * 1000, 2),

460 "count": len(sorted_desc),

461 "items": [

462 {

463 "listing_id": r["listing_id"],

464 "location": f"{r['city']}, {r['state']}" if r.get('city') else "Unknown",

465 "price": r.get("price"),

466 "room_type": r.get("room_type"),

467 "description_snippet": (r.get("description") or "")[:100] + "...",

468 "similarity": round(r["similarity"], 4),

469 }

470 for r in sorted_desc

471 ],

472 }

473 except Exception as e:

474 results["methods"]["descriptions"] = {"error": str(e), "count": 0, "items": []}

475

476 # 3. Visual search (Approach B - SigLIP)

477 visual_start = time.perf_counter()

478 if settings.siglip_enabled:

479 try:

480 query_embedding = await get_siglip_text_embedding(params.query)

481 if query_embedding:

482 visual_results = search_image_visuals(db, query_embedding, limit=params.limit * 3)

483 # Group by listing_id

484 best_by_listing = {}

485 for r in visual_results:

486 lid = r["listing_id"]

487 if lid not in best_by_listing or r["similarity"] > best_by_listing[lid]["similarity"]:

488 best_by_listing[lid] = r

489

490 sorted_visual = sorted(best_by_listing.values(), key=lambda x: x["similarity"], reverse=True)[:params.limit]

491

492 results["methods"]["visual"] = {

493 "time_ms": round((time.perf_counter() - visual_start) * 1000, 2),

494 "count": len(sorted_visual),

495 "items": [

496 {

497 "listing_id": r["listing_id"],

498 "location": f"{r['city']}, {r['state']}" if r.get('city') else "Unknown",

499 "price": r.get("price"),

500 "room_type": r.get("room_type"),

501 "similarity": round(r["similarity"], 4),

502 }

503 for r in sorted_visual

504 ],

505 }

506 else:

507 results["methods"]["visual"] = {"error": "Failed to get query embedding", "count": 0, "items": []}

508 except Exception as e:

509 results["methods"]["visual"] = {"error": str(e), "count": 0, "items": []}

510 else:

511 results["methods"]["visual"] = {"error": "SigLIP not enabled", "count": 0, "items": []}

512

513 # Analysis: Compare rankings

514 text_ids = [r["listing_id"] for r in results["methods"].get("text", {}).get("items", [])]

515 desc_ids = [r["listing_id"] for r in results["methods"].get("descriptions", {}).get("items", [])]

516 visual_ids = [r["listing_id"] for r in results["methods"].get("visual", {}).get("items", [])]

517

518 # Find properties that appear in multiple methods

519 all_ids = set(text_ids) | set(desc_ids) | set(visual_ids)

520 overlap_analysis = []

521 for lid in all_ids:

522 sources = []

523 if lid in text_ids:

524 sources.append(f"text(#{text_ids.index(lid)+1})")

525 if lid in desc_ids:

526 sources.append(f"desc(#{desc_ids.index(lid)+1})")

527 if lid in visual_ids:

528 sources.append(f"visual(#{visual_ids.index(lid)+1})")

529 if len(sources) > 1:

530 overlap_analysis.append({"listing_id": lid, "found_in": sources})

531

532 results["analysis"] = {

533 "total_time_ms": round((time.perf_counter() - start_time) * 1000, 2),

534 "unique_properties": len(all_ids),

535 "text_only": len(set(text_ids) - set(desc_ids) - set(visual_ids)),

536 "desc_only": len(set(desc_ids) - set(text_ids) - set(visual_ids)),

537 "visual_only": len(set(visual_ids) - set(text_ids) - set(desc_ids)),

538 "multi_source_matches": overlap_analysis[:10], # Top 10 overlaps

539 }

540

541 return results

542

543

544@router.post("/similar-images")

545async def find_similar_images(

546 image_url: str,

547 limit: int = 10,

548 db: Session = Depends(get_db),

549):

550 """

551 Visual similarity search: Find properties with visually similar photos.

552

553 Upload/provide an image URL (e.g., a kitchen photo from a listing the user loves)

554 and find other properties with visually similar spaces.

555

556 This uses SigLIP's image→image matching, which is much more powerful than

557 text→image for finding visual similarity (same cabinet style, similar layouts, etc.)

558 """

559 import time

560 from idx_api.config import settings

561 from idx_api.embeddings import search_image_visuals

562 from idx_api.vision import get_visual_embedding

563

564 if not settings.siglip_enabled:

565 raise HTTPException(status_code=503, detail="Visual search not enabled")

566

567 start_time = time.perf_counter()

568

569 # Get embedding for the reference image

570 try:

571 query_embedding = await get_visual_embedding(image_url)

572 if not query_embedding:

573 raise HTTPException(status_code=400, detail="Failed to embed image")

574 except Exception as e:

575 raise HTTPException(status_code=400, detail=f"Image processing failed: {str(e)}")

576

577 # Search for visually similar images

578 try:

579 results = search_image_visuals(db, query_embedding, limit=limit * 2)

580 except Exception as e:

581 raise HTTPException(status_code=500, detail=f"Search failed: {str(e)}")

582

583 # Group by listing_id and return best match per property

584 seen_listings = set()

585 unique_results = []

586 for r in results:

587 if r["listing_id"] not in seen_listings:

588 seen_listings.add(r["listing_id"])

589 unique_results.append({

590 "listing_id": r["listing_id"],

591 "image_url": r["image_url"],

592 "room_type": r["room_type"],

593 "similarity": r["similarity"],

594 "city": r["city"],

595 "state": r["state"],

596 "price": r["price"],

597 })

598 if len(unique_results) >= limit:

599 break

600

601 return {

602 "query_image": image_url,

603 "results": unique_results,

604 "count": len(unique_results),

605 "query_time_ms": round((time.perf_counter() - start_time) * 1000, 2),

606 }

607

608

609@router.get("/autocomplete")

610async def autocomplete(

611 q: str,

612 limit: int = 10,

613 db: Session = Depends(get_db),

614):

615 """

616 Autocomplete for property search.

617

618 Returns matching cities, addresses, and postal codes.

619 """

620 if len(q) < 2:

621 return {"suggestions": []}

622

623 search_term = f"%{q}%"

624

625 # Search cities

626 cities = db.execute(

627 text("""

628 SELECT DISTINCT city, state_or_province, COUNT(*) as count

629 FROM properties

630 WHERE city LIKE :term AND mlg_can_view = 1

631 GROUP BY city, state_or_province

632 ORDER BY count DESC

633 LIMIT :limit

634 """),

635 {"term": search_term, "limit": limit},

636 ).mappings().all()

637

638 # Search postal codes

639 postal_codes = db.execute(

640 text("""

641 SELECT DISTINCT postal_code, city, state_or_province

642 FROM properties

643 WHERE postal_code LIKE :term AND mlg_can_view = 1

644 LIMIT :limit

645 """),

646 {"term": search_term, "limit": limit},

647 ).mappings().all()

648

649 suggestions = []

650

651 for city in cities:

652 suggestions.append({

653 "type": "city",

654 "text": f"{city['city']}, {city['state_or_province']}",

655 "value": {"city": city["city"], "state": city["state_or_province"]},

656 })

657

658 for postal in postal_codes:

659 suggestions.append({

660 "type": "postal_code",

661 "text": f"{postal['postal_code']} ({postal['city']}, {postal['state_or_province']})",

662 "value": {"postal_code": postal["postal_code"]},

663 })

664

665 return {"suggestions": suggestions[:limit]}

666

667

668@router.get("/inverse-footnotes/{listing_id}")

669async def get_inverse_footnotes_for_listing(

670 listing_id: str,

671 db: Session = Depends(get_db),

672):

673 """

674 Generate inverse footnotes linking listing text to photos.

675

676 For a given listing, scans its public remarks for terms that match

677 extracted image features, and returns a mapping of terms to photos.

678

679 This enables UI like:

680 > "This home features a **spacious shop**<sup>📷3</sup> and **soaking tub**<sup>📷5</sup>"

681

682 Where clicking the photo markers shows the actual image evidence.

683

684 Returns:

685 - terms: Dict mapping feature terms found in text → list of image indices

686 - images: Dict mapping image_id → image metadata (URL, room type, etc.)

687 - annotated_text: The listing text with [[term:image_ids]] markers

688 - total_terms_found: Number of linked terms

689 - total_images_linked: Number of images with links

690 """

691 from idx_api.embeddings import get_inverse_footnotes

692

693 # Get the listing's public remarks

694 result = db.execute(

695 text("""

696 SELECT listing_id, public_remarks

697 FROM properties

698 WHERE listing_id = :listing_id

699 AND mlg_can_view = 1

700 """),

701 {"listing_id": listing_id},

702 ).mappings().first()

703

704 if not result:

705 raise HTTPException(status_code=404, detail="Listing not found")

706

707 public_remarks = result.get("public_remarks") or ""

708 if not public_remarks:

709 return {

710 "listing_id": listing_id,

711 "terms": {},

712 "images": {},

713 "annotated_text": "",

714 "total_terms_found": 0,

715 "total_images_linked": 0,

716 "note": "Listing has no public remarks text",

717 }

718

719 # Generate inverse footnotes

720 footnotes = get_inverse_footnotes(db, listing_id, public_remarks)

721

722 return {

723 "listing_id": listing_id,

724 **footnotes,

725 }

726

727

728@router.post("/analyze-image")

729async def analyze_single_image(

730 image_url: str,

731 db: Session = Depends(get_db),

732):

733 """

734 Analyze a single property image and extract structured tags.

735

736 Uses Qwen3-VL to generate a structured analysis including:

737 - room_type: kitchen, bathroom, bedroom, etc.

738 - features: island, pantry, fireplace, pool, etc.

739 - materials: granite, hardwood, tile, etc.

740 - style: modern, traditional, farmhouse, etc.

741 - condition: new, updated, original, dated

742 - highlights: buyer-exciting features

743 - quality_score: 1-5 rating

744

745 This is useful for testing the vision analysis before bulk indexing.

746 """

747 from idx_api.vision import analyze_image_structured

748

749 try:

750 tags = await analyze_image_structured(image_url)

751 if not tags:

752 raise HTTPException(status_code=500, detail="Image analysis failed")

753

754 return {

755 "image_url": image_url,

756 "tags": tags.model_dump(),

757 }

758 except Exception as e:

759 raise HTTPException(status_code=500, detail=f"Analysis error: {str(e)}")

760

761

762@router.post("/index-images-full/{listing_id}")

763async def index_images_full(

764 listing_id: str,

765 max_images: int = 5,

766 db: Session = Depends(get_db),

767):

768 """

769 Full image indexing: descriptions, embeddings, AND structured tags.

770

771 For each image in a listing:

772 1. Generate buyer-focused description (Qwen3-VL)

773 2. Create description embedding for semantic search (mxbai-embed-large)

774 3. Extract structured tags for inverse footnotes (room_type, features, materials)

775

776 This is the complete pipeline that enables both:

777 - Semantic image search ("granite countertops" finds kitchens with granite)

778 - Inverse footnotes (linking "shop" in listing text to shop photos)

779 """

780 import json

781 from idx_api.embeddings import index_property_image, index_image_feature_tags

782 from idx_api.vision import describe_image, analyze_image_structured, extract_room_type

783

784 # Get photos for this listing

785 prop_result = db.execute(

786 text("""

787 SELECT listing_id, photos, primary_photo_url

788 FROM properties

789 WHERE listing_id = :listing_id AND mlg_can_view = 1

790 """),

791 {"listing_id": listing_id},

792 ).mappings().first()

793

794 if not prop_result:

795 raise HTTPException(status_code=404, detail="Listing not found")

796

797 photos = []

798 if prop_result.get("photos"):

799 try:

800 photos = json.loads(prop_result["photos"])[:max_images]

801 except json.JSONDecodeError:

802 pass

803 elif prop_result.get("primary_photo_url"):

804 photos = [prop_result["primary_photo_url"]]

805

806 if not photos:

807 raise HTTPException(status_code=404, detail="No photos found for listing")

808

809 results = {

810 "listing_id": listing_id,

811 "images_processed": 0,

812 "descriptions_generated": 0,

813 "embeddings_created": 0,

814 "tags_indexed": 0,

815 "images": [],

816 }

817

818 for idx, photo_url in enumerate(photos):

819 image_result = {

820 "url": photo_url,

821 "index": idx,

822 "description": None,

823 "room_type": None,

824 "tags": None,

825 }

826

827 # Step 1: Generate description for semantic search

828 description = await describe_image(photo_url)

829 if description:

830 room_type = extract_room_type(description)

831 image_result["description"] = description[:100] + "..." if len(description) > 100 else description

832 image_result["room_type"] = room_type

833 results["descriptions_generated"] += 1

834

835 # Step 2: Store description and create embedding

836 try:

837 image_id = index_property_image(

838 db, listing_id, photo_url, idx, description, room_type

839 )

840 results["embeddings_created"] += 1

841

842 # Step 3: Get structured tags for inverse footnotes

843 tags = await analyze_image_structured(photo_url)

844 if tags:

845 tag_dict = {

846 "room_type": [tags.room_type],

847 "features": tags.features,

848 "materials": tags.materials,

849 "style": [tags.style] if tags.style else [],

850 "condition": [tags.condition] if tags.condition else [],

851 "highlights": tags.highlights,

852 }

853 tag_count = index_image_feature_tags(db, image_id, tag_dict)

854 results["tags_indexed"] += tag_count

855 image_result["tags"] = {

856 "room_type": tags.room_type,

857 "features": tags.features[:3] if tags.features else [],

858 "tag_count": tag_count,

859 }

860

861 except Exception as e:

862 image_result["error"] = str(e)

863

864 results["images_processed"] += 1

865 results["images"].append(image_result)

866

867 db.commit()

868 return results

869

870

871@router.post("/index-images-batch")

872async def index_images_batch(

873 limit: int = 5,

874 max_images_per_listing: int = 5,

875 db: Session = Depends(get_db),

876):

877 """

878 Batch index images for multiple listings.

879

880 Finds unindexed properties and processes them with full indexing

881 (descriptions, embeddings, and structured tags).

882

883 Use this to gradually build up the image index.

884 """

885 import json

886 from idx_api.embeddings import (

887 get_unindexed_property_images,

888 index_property_image,

889 index_image_feature_tags,

890 )

891 from idx_api.vision import describe_image, analyze_image_structured, extract_room_type

892

893 # Get properties that haven't been indexed yet

894 unindexed = get_unindexed_property_images(db, limit=limit)

895

896 if not unindexed:

897 return {

898 "message": "All properties already indexed",

899 "properties_processed": 0,

900 }

901

902 results = {

903 "properties_processed": 0,

904 "images_processed": 0,

905 "descriptions_generated": 0,

906 "embeddings_created": 0,

907 "tags_indexed": 0,

908 "properties": [],

909 }

910

911 for prop in unindexed:

912 listing_id = prop["listing_id"]

913 photos = prop["photos"][:max_images_per_listing]

914

915 prop_result = {

916 "listing_id": listing_id,

917 "images": len(photos),

918 "indexed": 0,

919 "tags": 0,

920 }

921

922 for idx, photo_url in enumerate(photos):

923 # Generate description

924 description = await describe_image(photo_url)

925 if not description:

926 continue

927

928 room_type = extract_room_type(description)

929 results["descriptions_generated"] += 1

930

931 # Store and create embedding

932 try:

933 image_id = index_property_image(

934 db, listing_id, photo_url, idx, description, room_type

935 )

936 results["embeddings_created"] += 1

937 prop_result["indexed"] += 1

938

939 # Get structured tags

940 tags = await analyze_image_structured(photo_url)

941 if tags:

942 tag_dict = {

943 "room_type": [tags.room_type],

944 "features": tags.features,

945 "materials": tags.materials,

946 "style": [tags.style] if tags.style else [],

947 "condition": [tags.condition] if tags.condition else [],

948 "highlights": tags.highlights,

949 }

950 tag_count = index_image_feature_tags(db, image_id, tag_dict)

951 results["tags_indexed"] += tag_count

952 prop_result["tags"] += tag_count

953

954 except Exception as e:

955 prop_result["error"] = str(e)

956

957 results["images_processed"] += 1

958

959 # Commit after each property to avoid long transactions

960 db.commit()

961 results["properties_processed"] += 1

962 results["properties"].append(prop_result)

963

964 return results

965

966

967@router.post("/index-image-tags/{listing_id}")

968async def index_images_with_tags(

969 listing_id: str,

970 max_images: int = 5,

971 db: Session = Depends(get_db),

972):

973 """

974 Index property images with structured feature tags only.

975

976 Use /index-images-full/{listing_id} for complete indexing including

977 description embeddings. This endpoint only adds structured tags

978 to already-indexed images.

979 """

980 import json

981 from idx_api.embeddings import index_image_feature_tags

982 from idx_api.vision import analyze_image_structured

983

984 # Get the listing's existing images

985 result = db.execute(

986 text("""

987 SELECT id, image_url, image_index

988 FROM property_images

989 WHERE listing_id = :listing_id

990 ORDER BY image_index

991 LIMIT :max_images

992 """),

993 {"listing_id": listing_id, "max_images": max_images},

994 ).mappings().all()

995

996 if not result:

997 # No images indexed yet - get from property and analyze

998 prop_result = db.execute(

999 text("""

1000 SELECT photos, primary_photo_url

1001 FROM properties

1002 WHERE listing_id = :listing_id AND mlg_can_view = 1

1003 """),

1004 {"listing_id": listing_id},

1005 ).mappings().first()

1006

1007 if not prop_result:

1008 raise HTTPException(status_code=404, detail="Listing not found")

1009

1010 photos = []

1011 if prop_result.get("photos"):

1012 try:

1013 photos = json.loads(prop_result["photos"])[:max_images]

1014 except json.JSONDecodeError:

1015 pass

1016 elif prop_result.get("primary_photo_url"):

1017 photos = [prop_result["primary_photo_url"]]

1018

1019 if not photos:

1020 raise HTTPException(status_code=404, detail="No photos found for listing")

1021

1022 # Analyze and index each image

1023 indexed_count = 0

1024 tag_count = 0

1025 for idx, photo_url in enumerate(photos):

1026 tags = await analyze_image_structured(photo_url)

1027 if not tags:

1028 continue

1029

1030 # Store image metadata first (if not already indexed)

1031 db.execute(

1032 text("""

1033 INSERT INTO property_images (listing_id, image_url, image_index, description, room_type)

1034 VALUES (:listing_id, :image_url, :image_index, :description, :room_type)

1035 ON CONFLICT(listing_id, image_url) DO UPDATE SET

1036 room_type = excluded.room_type

1037 """),

1038 {

1039 "listing_id": listing_id,

1040 "image_url": photo_url,

1041 "image_index": idx,

1042 "description": "", # Can be filled by regular indexing

1043 "room_type": tags.room_type,

1044 },

1045 )

1046

1047 # Get the image ID

1048 img_result = db.execute(

1049 text("SELECT id FROM property_images WHERE listing_id = :lid AND image_url = :url"),

1050 {"lid": listing_id, "url": photo_url},

1051 ).fetchone()

1052

1053 if img_result:

1054 # Index the structured tags

1055 tag_dict = {

1056 "room_type": [tags.room_type],

1057 "features": tags.features,

1058 "materials": tags.materials,

1059 "style": [tags.style] if tags.style else [],

1060 "condition": [tags.condition] if tags.condition else [],

1061 "highlights": tags.highlights,

1062 }

1063 count = index_image_feature_tags(db, img_result[0], tag_dict)

1064 tag_count += count

1065 indexed_count += 1

1066

1067 db.commit()

1068

1069 return {

1070 "listing_id": listing_id,

1071 "images_processed": indexed_count,

1072 "tags_indexed": tag_count,

1073 "status": "success",

1074 }

1075

1076 # Images already exist - analyze and add tags

1077 indexed_count = 0

1078 tag_count = 0

1079

1080 for row in result:

1081 tags = await analyze_image_structured(row["image_url"])

1082 if not tags:

1083 continue

1084

1085 # Update room type

1086 db.execute(

1087 text("UPDATE property_images SET room_type = :room_type WHERE id = :id"),

1088 {"id": row["id"], "room_type": tags.room_type},

1089 )

1090

1091 # Index feature tags

1092 tag_dict = {

1093 "room_type": [tags.room_type],

1094 "features": tags.features,

1095 "materials": tags.materials,

1096 "style": [tags.style] if tags.style else [],

1097 "condition": [tags.condition] if tags.condition else [],

1098 "highlights": tags.highlights,

1099 }

1100 count = index_image_feature_tags(db, row["id"], tag_dict)

1101 tag_count += count

1102 indexed_count += 1

1103

1104 db.commit()

1105

1106 return {

1107 "listing_id": listing_id,

1108 "images_processed": indexed_count,

1109 "tags_indexed": tag_count,

1110 "status": "success",

1111 }

1112

1113

1114@router.get("/image-index-stats")

1115async def get_image_index_stats(db: Session = Depends(get_db)):

1116 """

1117 Get statistics about the image indexing progress.

1118

1119 Returns counts of indexed images, feature tags, and remaining properties.

1120 """

1121 # Total properties with photos

1122 total_with_photos = db.execute(text("""

1123 SELECT COUNT(*) FROM properties

1124 WHERE photos IS NOT NULL

1125 AND photos != '[]'

1126 AND mlg_can_view = 1

1127 """)).scalar()

1128

1129 # Indexed listings

1130 indexed_listings = db.execute(text("""

1131 SELECT COUNT(DISTINCT listing_id) FROM property_images

1132 """)).scalar()

1133

1134 # Total images

1135 total_images = db.execute(text("""

1136 SELECT COUNT(*) FROM property_images

1137 """)).scalar()

1138

1139 # Images with descriptions

1140 with_descriptions = db.execute(text("""

1141 SELECT COUNT(*) FROM property_images WHERE description IS NOT NULL

1142 """)).scalar()

1143

1144 # Description embeddings

1145 desc_embeddings = db.execute(text("""

1146 SELECT COUNT(*) FROM vec_image_descriptions

1147 """)).scalar()

1148

1149 # Feature tags

1150 feature_tags = db.execute(text("""

1151 SELECT COUNT(*) FROM image_feature_tags

1152 """)).scalar()

1153

1154 # Tags per type

1155 tags_by_type = db.execute(text("""

1156 SELECT feature_type, COUNT(*) as count

1157 FROM image_feature_tags

1158 GROUP BY feature_type

1159 ORDER BY count DESC

1160 """)).mappings().all()

1161

1162 # Room type distribution

1163 room_types = db.execute(text("""

1164 SELECT room_type, COUNT(*) as count

1165 FROM property_images

1166 WHERE room_type IS NOT NULL

1167 GROUP BY room_type

1168 ORDER BY count DESC

1169 """)).mappings().all()

1170

1171 return {

1172 "properties": {

1173 "total_with_photos": total_with_photos,

1174 "indexed": indexed_listings,

1175 "remaining": total_with_photos - indexed_listings,

1176 "percent_complete": round(indexed_listings / total_with_photos * 100, 1) if total_with_photos > 0 else 0,

1177 },

1178 "images": {

1179 "total": total_images,

1180 "with_descriptions": with_descriptions,

1181 "embeddings": desc_embeddings,

1182 },

1183 "feature_tags": {

1184 "total": feature_tags,

1185 "by_type": [dict(t) for t in tags_by_type],

1186 },

1187 "room_types": [dict(r) for r in room_types],

1188 }

Coverage for src / idx_api / routers / search.py: 8%

382 statements