Coverage for src / idx_api / embeddings / blog_posts.py: 15%
46 statements
« prev ^ index » next coverage.py v7.13.1, created at 2025-12-28 11:16 -0700
« prev ^ index » next coverage.py v7.13.1, created at 2025-12-28 11:16 -0700
1"""Blog post embedding functions for semantic content search with PostgreSQL/pgvector."""
3from sqlalchemy import text
4from sqlalchemy.orm import Session
6from idx_api.embeddings.base import embedding_to_pgvector, generate_embedding_sync
def build_blog_post_text(post_data: dict, *, max_content_chars: int = 1000) -> str:
    """Build a rich text representation of a blog post for embedding.

    The parts are emitted in a fixed order (title, description, category,
    tags, content); any part whose value is missing or empty is skipped.

    Args:
        post_data: Blog post data dict. The keys read are ``title``,
            ``description``, ``category``, ``tags`` and ``content``.
            ``tags`` may be an iterable of values or a single string.
        max_content_chars: Maximum number of characters of ``content`` to
            keep; longer content is cut at this length and suffixed with
            ``"..."``.

    Returns:
        The non-empty parts joined with ``" - "``, or an empty string when
        no part is present.
    """
    parts = []

    title = post_data.get("title")
    if title:
        parts.append(title)

    description = post_data.get("description")
    if description:
        parts.append(description)

    # Add category context
    category = post_data.get("category")
    if category:
        parts.append(f"Category: {category}")

    # Add tags
    tags = post_data.get("tags")
    if tags:
        # A bare string would otherwise be iterated character by character
        # by str.join, producing "b, u, y, ..." instead of the tag itself.
        if isinstance(tags, str):
            tags = [tags]
        # str() keeps non-string tags (e.g. numbers) from raising TypeError.
        parts.append(f"Tags: {', '.join(str(tag) for tag in tags)}")

    # Add content (truncated so very long posts do not dominate the text)
    content = post_data.get("content")
    if content:
        if len(content) > max_content_chars:
            content = content[:max_content_chars] + "..."
        parts.append(content)

    return " - ".join(parts)
def index_blog_post(session: Session, post_slug: str, text_content: str) -> None:
    """Index a blog post for vector search using pgvector.

    Generates an embedding for ``text_content`` and upserts it into the
    ``blog_post_embeddings`` table keyed by ``post_slug``. The statement is
    executed on ``session`` but not committed here; committing is left to
    the caller.

    Note: This assumes a ``blog_post_embeddings`` table exists with a
    ``post_slug`` column usable as an ``ON CONFLICT`` target and an
    ``embedding`` vector column.

    Args:
        session: Database session
        post_slug: Blog post slug/identifier
        text_content: Text representation of the post

    Returns:
        None. When no embedding could be generated, a message is printed
        and nothing is written.
    """
    # Generate embedding via Ollama API
    embedding = generate_embedding_sync(text_content)

    if embedding is None:
        print(f"Failed to generate embedding for blog post {post_slug}")
        return

    # Upsert into blog_post_embeddings table.
    # CAST(... AS vector) is used instead of the "::vector" shorthand: a
    # "::" placed directly after a bind parameter collides with the colon
    # syntax that text() uses to recognize parameters.
    session.execute(
        text("""
            INSERT INTO blog_post_embeddings (post_slug, embedding)
            VALUES (:post_slug, CAST(:embedding AS vector))
            ON CONFLICT (post_slug) DO UPDATE SET
                embedding = EXCLUDED.embedding
        """),
        {
            "post_slug": post_slug,
            "embedding": embedding_to_pgvector(embedding),
        },
    )
def search_blog_posts(
    session: Session,
    query: str,
    limit: int = 10,
) -> list[dict]:
    """Search blog posts by semantic similarity using pgvector.

    The query is embedded and compared against the stored embeddings in
    ``blog_post_embeddings`` with pgvector's ``<=>`` distance operator;
    rows are returned closest first.

    Args:
        session: Database session
        query: Search query (e.g., "first time home buyer tips")
        limit: Maximum results to return

    Returns:
        List of dicts with keys ``slug`` and ``similarity`` (computed as
        ``1.0 - distance``), ordered from most to least similar. Empty when
        no embedding could be generated for the query.
    """
    # Generate query embedding
    query_embedding = generate_embedding_sync(query)

    if query_embedding is None:
        return []

    # CAST(... AS vector) is used instead of the "::vector" shorthand: a
    # "::" placed directly after a bind parameter collides with the colon
    # syntax that text() uses to recognize parameters.
    result = session.execute(
        text("""
            SELECT
                post_slug,
                embedding <=> CAST(:query_embedding AS vector) AS similarity_distance
            FROM blog_post_embeddings
            WHERE embedding IS NOT NULL
            ORDER BY embedding <=> CAST(:query_embedding AS vector)
            LIMIT :limit
        """),
        {
            "query_embedding": embedding_to_pgvector(query_embedding),
            "limit": limit,
        },
    )

    return [
        {
            "slug": row.post_slug,
            # Convert the distance into a "higher is better" score.
            "similarity": 1.0 - row.similarity_distance,
        }
        for row in result.fetchall()
    ]
def index_blog_posts_from_data(session: Session, posts: list[dict]) -> int:
    """Index a batch of blog posts and commit the session once at the end.

    This is called from an admin endpoint that reads blog posts from
    the frontend content directory.

    Posts without a truthy ``slug`` are skipped. Every other post is turned
    into text with ``build_blog_post_text`` and passed to
    ``index_blog_post``.

    Note: the returned count is the number of posts handed to
    ``index_blog_post``. That function returns nothing to signal a failed
    embedding, so such posts are still counted.

    Args:
        session: Database session
        posts: List of blog post dicts with slug, title, description, content, tags

    Returns:
        Number of posts submitted for indexing
    """
    indexed = 0
    for entry in posts:
        post_slug = entry.get("slug")
        if post_slug:
            index_blog_post(session, post_slug, build_blog_post_text(entry))
            indexed += 1

    # Single commit for the whole batch.
    session.commit()
    return indexed