Coverage for src/idx_api/embeddings/blog_posts.py

1"""Blog post embedding functions for semantic content search with PostgreSQL/pgvector."""

3from sqlalchemy import text

4from sqlalchemy.orm import Session

6from idx_api.embeddings.base import embedding_to_pgvector, generate_embedding_sync

def build_blog_post_text(post_data: dict) -> str:
    """Build rich text representation of a blog post for embedding.

    Sections are emitted in a fixed order (title, description, category,
    tags, content); missing or empty sections are skipped and the rest are
    joined with " - ".

    Args:
        post_data: Blog post data dict with title, description, category,
            tags and content keys (all optional). ``tags`` may be a list of
            values or a single string.

    Returns:
        Text representation for semantic search; an empty string when no
        section is present.
    """
    parts = []

    title = post_data.get("title")
    if title:
        parts.append(title)

    description = post_data.get("description")
    if description:
        parts.append(description)

    # Add category context
    category = post_data.get("category")
    if category:
        parts.append(f"Category: {category}")

    # Add tags
    tags = post_data.get("tags") or []
    if isinstance(tags, str):
        # A bare string would otherwise be joined character by character.
        tags = [tags]
    # Coerce to str so non-string tags (e.g. a numeric year) do not make
    # str.join raise TypeError; None entries carry no information.
    tag_names = [str(tag) for tag in tags if tag is not None]
    if tag_names:
        parts.append(f"Tags: {', '.join(tag_names)}")

    # Add content (truncated to the first 1000 characters, marked with "...")
    content = post_data.get("content", "")
    if content:
        if len(content) > 1000:
            content = content[:1000] + "..."
        parts.append(content)

    return " - ".join(parts)

def index_blog_post(session: Session, post_slug: str, text_content: str) -> None:
    """Index a blog post for vector search using pgvector.

    Note: This assumes a blog_post_embeddings table with a unique post_slug
    column and an embedding column exists. Embeddings are stored in a simple
    key-value style table.

    The statement is executed on ``session`` but not committed here; the
    caller is responsible for committing.

    Args:
        session: Database session
        post_slug: Blog post slug/identifier
        text_content: Text representation of the post
    """
    # Generate embedding; a None result is treated as a generation failure.
    embedding = generate_embedding_sync(text_content)

    if embedding is None:
        print(f"Failed to generate embedding for blog post {post_slug}")
        return

    # Upsert into blog_post_embeddings table.
    # CAST(... AS vector) is used instead of the PostgreSQL "::vector"
    # shorthand because a "::" immediately after a bind parameter collides
    # with text()'s ":name" bind-parameter syntax.
    session.execute(
        text("""
            INSERT INTO blog_post_embeddings (post_slug, embedding)
            VALUES (:post_slug, CAST(:embedding AS vector))
            ON CONFLICT (post_slug) DO UPDATE SET
                embedding = EXCLUDED.embedding
        """),
        {
            "post_slug": post_slug,
            "embedding": embedding_to_pgvector(embedding),
        },
    )

def search_blog_posts(
    session: Session,
    query: str,
    limit: int = 10,
) -> list[dict]:
    """Search blog posts by semantic similarity using pgvector.

    Args:
        session: Database session
        query: Search query (e.g., "first time home buyer tips")
        limit: Maximum results to return

    Returns:
        List of dicts with "slug" and "similarity" keys, ordered from most to
        least similar. "similarity" is 1.0 minus the distance reported by the
        ``<=>`` operator. An empty list is returned when no embedding could
        be generated for the query.
    """
    # Generate query embedding
    query_embedding = generate_embedding_sync(query)

    if query_embedding is None:
        return []

    # CAST(... AS vector) is used instead of the PostgreSQL "::vector"
    # shorthand because a "::" immediately after a bind parameter collides
    # with text()'s ":name" bind-parameter syntax.
    result = session.execute(
        text("""
            SELECT
                post_slug,
                embedding <=> CAST(:query_embedding AS vector) AS similarity_distance
            FROM blog_post_embeddings
            WHERE embedding IS NOT NULL
            ORDER BY embedding <=> CAST(:query_embedding AS vector)
            LIMIT :limit
        """),
        {
            "query_embedding": embedding_to_pgvector(query_embedding),
            "limit": limit,
        },
    )

    return [
        {
            "slug": row.post_slug,
            "similarity": 1.0 - row.similarity_distance,
        }
        for row in result.fetchall()
    ]

125

126

def index_blog_posts_from_data(session: Session, posts: list[dict]) -> int:
    """Index a batch of blog posts and commit once at the end.

    Called from an admin endpoint that reads blog posts from the frontend
    content directory.

    Args:
        session: Database session
        posts: List of blog post dicts with slug, title, description,
            content, tags. Entries without a truthy "slug" are skipped.

    Returns:
        Number of posts submitted for indexing. A post is counted once
        index_blog_post has been called for it, including the case where
        that call could not generate an embedding.
    """
    indexed = 0
    for entry in posts:
        identifier = entry.get("slug")
        if identifier:
            index_blog_post(session, identifier, build_blog_post_text(entry))
            indexed += 1

    # Single commit for the whole batch.
    session.commit()
    return indexed

Coverage for src / idx_api / embeddings / blog_posts.py: 15%

46 statements