Coverage for src / idx_api / embeddings / broker_contacts.py: 15%

48 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2025-12-28 11:12 -0700

1"""Broker contact embedding functions for semantic person search with PostgreSQL/pgvector.""" 

2 

3from sqlalchemy import text 

4from sqlalchemy.orm import Session 

5 

6from idx_api.embeddings.base import embedding_to_pgvector, generate_embedding_sync 

7 

8 

def _contact_field(contact, field):
    """Read ``field`` from an attribute-style object or a mapping-style contact.

    Attribute access wins when the attribute exists; otherwise the value is
    looked up through ``.get()`` if the object offers one. Returns ``None``
    when the field cannot be found either way.
    """
    if hasattr(contact, field):
        return getattr(contact, field)
    # Plain dicts (and other mapping-like inputs) expose values via .get();
    # an object with neither the attribute nor .get() just lacks the field.
    getter = getattr(contact, "get", None)
    return getter(field) if callable(getter) else None


def build_broker_contact_text(contact) -> str:
    """Build rich text representation of a broker contact for embedding.

    The pieces are emitted in a fixed order (name, email, license, primary
    flag, brokerage) and joined with " - ". Missing or falsy fields are
    skipped, so an empty contact yields an empty string.

    Args:
        contact: Broker (contact) ORM model or dict. Objects are read by
            attribute; dicts are read by key. For the brokerage name, an
            object's truthy ``brokerage.name`` is used, otherwise the
            ``brokerage_name`` field.

    Returns:
        Text representation for semantic search
    """
    parts = []

    # Always include name
    name = _contact_field(contact, "name")
    if name:
        parts.append(name)

    # Add email
    email = _contact_field(contact, "email")
    if email:
        parts.append(email)

    # Add license info
    license_type = _contact_field(contact, "license_type")
    if license_type:
        parts.append(f"{license_type} license")

    # Add primary contact indicator
    if _contact_field(contact, "is_primary"):
        parts.append("primary contact")

    # Add brokerage context if available. An object whose `brokerage`
    # relationship is unset (None) must not be treated like a dict: fall back
    # to a flat `brokerage_name` field only if the contact actually has one.
    brokerage = getattr(contact, "brokerage", None)
    if brokerage:
        brokerage_name = brokerage.name
    else:
        brokerage_name = _contact_field(contact, "brokerage_name")
    if brokerage_name:
        parts.append(f"at {brokerage_name}")

    return " - ".join(parts)

58 

59 

def index_broker_contact(session: Session, contact_id: int, text_content: str) -> None:
    """Index a broker contact for vector search using pgvector.

    Generates an embedding for ``text_content`` and writes it to the
    ``embedding`` column of the matching ``brokers`` row. The statement is
    executed on ``session`` but not committed here; committing is left to
    the caller.

    Args:
        session: Database session
        contact_id: ID of the broker contact to index
        text_content: Text representation of the contact

    Returns:
        None. If no embedding could be generated, a message is printed and
        the row is left untouched.
    """
    embedding = generate_embedding_sync(text_content)

    if embedding is None:
        print(f"Failed to generate embedding for broker contact {contact_id}")
        return

    # Store embedding directly in brokers table.
    # CAST(... AS vector) is used instead of the `:embedding::vector`
    # shorthand: a `::` directly after a bind name collides with the
    # colon-based bind-parameter syntax of SQLAlchemy's text().
    session.execute(
        text("""
            UPDATE brokers
            SET embedding = CAST(:embedding AS vector)
            WHERE id = :contact_id
        """),
        {
            "contact_id": contact_id,
            "embedding": embedding_to_pgvector(embedding),
        },
    )

87 

88 

def search_broker_contacts(
    session: Session,
    query: str,
    brokerage_id: int | None = None,
    limit: int = 10,
) -> list[dict]:
    """Search broker contacts by semantic similarity using pgvector.

    Only rows in ``brokers`` that already have an embedding are considered.
    Results are ordered by ascending ``<=>`` distance to the query embedding.

    Args:
        session: Database session
        query: Search query (e.g., "primary contact" or "licensed broker")
        brokerage_id: Optional filter by brokerage ID. ``None`` means no
            filter; any integer (including 0) restricts the search to that
            brokerage.
        limit: Maximum results to return

    Returns:
        List of matching broker contacts as dicts with keys ``id``,
        ``brokerage_id``, ``name``, ``email``, ``phone``, ``license_type``,
        ``is_primary`` and ``similarity`` (``1.0 - distance``). Empty list
        when the query embedding could not be generated.
    """
    query_embedding = generate_embedding_sync(query)

    if query_embedding is None:
        return []

    params = {
        "query_embedding": embedding_to_pgvector(query_embedding),
        "limit": limit,
    }

    # The optional filter is a fixed SQL fragment (never user text); the
    # brokerage id itself always travels as a bind parameter.
    brokerage_clause = ""
    if brokerage_id is not None:
        brokerage_clause = "AND bc.brokerage_id = :brokerage_id"
        params["brokerage_id"] = brokerage_id

    # CAST(... AS vector) replaces the `:query_embedding::vector` shorthand,
    # whose `::` collides with text()'s colon-based bind-parameter syntax.
    sql = text(f"""
        SELECT
            bc.id,
            bc.brokerage_id,
            bc.name,
            bc.email,
            bc.phone,
            bc.license_type,
            bc.is_primary,
            bc.embedding <=> CAST(:query_embedding AS vector) AS similarity_distance
        FROM brokers bc
        WHERE bc.embedding IS NOT NULL
        {brokerage_clause}
        ORDER BY bc.embedding <=> CAST(:query_embedding AS vector)
        LIMIT :limit
    """)

    result = session.execute(sql, params)

    return [
        {
            "id": row.id,
            "brokerage_id": row.brokerage_id,
            "name": row.name,
            "email": row.email,
            "phone": row.phone,
            "license_type": row.license_type,
            "is_primary": row.is_primary,
            # Convert distance (smaller is closer) into a similarity score.
            "similarity": 1.0 - row.similarity_distance,
        }
        for row in result.fetchall()
    ]

171 

172 

def reindex_all_broker_contacts(session: Session) -> int:
    """Reindex all broker contacts for vector search.

    Loads every broker whose ``disabled_at`` is NULL together with its
    brokerage name, rebuilds the contact text, re-indexes each one and
    commits once at the end.

    Args:
        session: Database session

    Returns:
        Number of broker contacts processed. NOTE(review): a contact whose
        embedding generation failed is still counted, because
        ``index_broker_contact`` does not report failure to its caller.
    """
    # Select only the columns the text builder reads. `bc.*` would also pull
    # each row's stored embedding vector just to throw it away.
    result = session.execute(
        text("""
            SELECT
                bc.id,
                bc.name,
                bc.email,
                bc.license_type,
                bc.is_primary,
                b.name AS brokerage_name
            FROM brokers bc
            LEFT JOIN brokerages b ON b.id = bc.brokerage_id
            WHERE bc.disabled_at IS NULL
        """)
    )

    count = 0
    # Materialize the rows first so the UPDATEs issued below do not
    # interleave with reading this result set.
    for row in result.fetchall():
        row_dict = row._asdict()
        text_content = build_broker_contact_text(row_dict)
        index_broker_contact(session, row_dict["id"], text_content)
        count += 1

    session.commit()
    return count