Coverage for src/idx_api/embeddings/blog_posts.py: 15%

46 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2025-12-28 11:16 -0700

1"""Blog post embedding functions for semantic content search with PostgreSQL/pgvector.""" 

2 

3from sqlalchemy import text 

4from sqlalchemy.orm import Session 

5 

6from idx_api.embeddings.base import embedding_to_pgvector, generate_embedding_sync 

7 

8 

def build_blog_post_text(post_data: dict, *, max_content_length: int = 1000) -> str:
    """Build rich text representation of a blog post for embedding.

    The present, non-empty fields are emitted in a fixed order (title,
    description, category, tags, content) and joined with " - ".

    Args:
        post_data: Blog post data dict with title, description, category,
            tags and content keys. Every key is optional; missing or empty
            values are skipped.
        max_content_length: Maximum number of characters of ``content`` to
            keep. Longer content is cut at this length and suffixed with
            "...".

    Returns:
        Text representation for semantic search, or an empty string when
        none of the fields are present.
    """
    parts = []

    title = post_data.get("title")
    if title:
        parts.append(title)

    description = post_data.get("description")
    if description:
        parts.append(description)

    # Add category context
    category = post_data.get("category")
    if category:
        parts.append(f"Category: {category}")

    # Add tags
    tags = post_data.get("tags") or []
    if isinstance(tags, str):
        # A lone string is one tag; joining it directly would split it
        # into individual characters.
        tags = [tags]
    if tags:
        # str() keeps non-string tags (e.g. a numeric year) from breaking join().
        parts.append(f"Tags: {', '.join(str(tag) for tag in tags)}")

    # Add content (truncated)
    content = post_data.get("content") or ""
    if content:
        if len(content) > max_content_length:
            content = content[:max_content_length] + "..."
        parts.append(content)

    return " - ".join(parts)

46 

47 

def index_blog_post(session: Session, post_slug: str, text_content: str) -> None:
    """Index a blog post for vector search using pgvector.

    Generates an embedding for ``text_content`` and upserts it into the
    ``blog_post_embeddings`` table, keyed by ``post_slug``. The statement
    relies on ``ON CONFLICT (post_slug)``, so that column must carry a
    unique constraint.

    This function does not commit; the caller owns the transaction.

    Args:
        session: Database session
        post_slug: Blog post slug/identifier
        text_content: Text representation of the post

    Returns:
        None. When no embedding could be generated, a message is printed and
        nothing is written.
    """
    embedding = generate_embedding_sync(text_content)

    if embedding is None:
        print(f"Failed to generate embedding for blog post {post_slug}")
        return

    # Upsert into blog_post_embeddings table.
    # CAST(... AS vector) is used instead of the ":embedding::vector"
    # shorthand: a bind parameter immediately followed by "::" is not parsed
    # reliably by SQLAlchemy's text() bind-parameter detection.
    session.execute(
        text("""
            INSERT INTO blog_post_embeddings (post_slug, embedding)
            VALUES (:post_slug, CAST(:embedding AS vector))
            ON CONFLICT (post_slug) DO UPDATE SET
                embedding = EXCLUDED.embedding
        """),
        {
            "post_slug": post_slug,
            "embedding": embedding_to_pgvector(embedding),
        },
    )

79 

80 

def search_blog_posts(
    session: Session,
    query: str,
    limit: int = 10,
) -> list[dict]:
    """Search blog posts by semantic similarity using pgvector.

    The query is embedded, then rows of ``blog_post_embeddings`` are ordered
    by the pgvector ``<=>`` distance to that embedding, closest first.

    Args:
        session: Database session
        query: Search query (e.g., "first time home buyer tips")
        limit: Maximum results to return

    Returns:
        List of dicts with ``slug`` and ``similarity`` keys, best match
        first, where ``similarity`` is ``1.0 - distance``. Returns an empty
        list when no embedding could be generated for the query.
    """
    query_embedding = generate_embedding_sync(query)

    if query_embedding is None:
        return []

    # CAST(... AS vector) is used instead of the ":query_embedding::vector"
    # shorthand: a bind parameter immediately followed by "::" is not parsed
    # reliably by SQLAlchemy's text() bind-parameter detection.
    result = session.execute(
        text("""
            SELECT
                post_slug,
                embedding <=> CAST(:query_embedding AS vector) AS similarity_distance
            FROM blog_post_embeddings
            WHERE embedding IS NOT NULL
            ORDER BY embedding <=> CAST(:query_embedding AS vector)
            LIMIT :limit
        """),
        {
            "query_embedding": embedding_to_pgvector(query_embedding),
            "limit": limit,
        },
    )

    return [
        {
            "slug": row.post_slug,
            # Convert the distance into a "higher is better" score.
            "similarity": 1.0 - row.similarity_distance,
        }
        for row in result.fetchall()
    ]

125 

126 

def index_blog_posts_from_data(session: Session, posts: list[dict]) -> int:
    """Index a batch of blog posts and commit once at the end.

    Intended to be called from an admin endpoint that reads blog posts from
    the frontend content directory.

    Args:
        session: Database session
        posts: List of blog post dicts with slug, title, description, content, tags

    Returns:
        Number of posts that had a slug and were passed to index_blog_post.
        Posts without a slug are skipped. NOTE: a post is counted even if
        index_blog_post could not generate an embedding for it.
    """
    indexed = 0
    for entry in posts:
        post_slug = entry.get("slug")
        if post_slug:
            index_blog_post(session, post_slug, build_blog_post_text(entry))
            indexed += 1

    # Single commit for the whole batch.
    session.commit()
    return indexed