import os

import openai
import psycopg2
from hyrex import HyrexRegistry

# NOTE(review): `gdrive_sdk` is used below but is not imported in this
# snippet; the application must supply it (e.g. a thin wrapper around the
# Google Drive API) before these tasks can run.

hy = HyrexRegistry()


@hy.task
def update_db_record(doc_id: str, embedding: list[float]):
    """Store a document embedding in PostgreSQL.

    Args:
        doc_id: Value matched against ``documents.doc_id``.
        embedding: Embedding written to ``documents.embedding``.
    """
    conn = psycopg2.connect(os.environ.get("DB_CONN_STRING"))
    try:
        # psycopg2's connection context manager only commits (or rolls back
        # on error); it does not close the connection, so close it explicitly
        # to avoid leaking one connection per task run.
        with conn:
            with conn.cursor() as cursor:
                cursor.execute(
                    "UPDATE documents SET embedding = %s WHERE doc_id = %s",
                    (embedding, doc_id),
                )
    finally:
        conn.close()


@hy.task
def process_document(doc_id: str):
    """Generate an embedding for a single document and queue its storage.

    Args:
        doc_id: Identifier passed to ``gdrive_sdk.get_document``.
    """
    # Fetch document content from Google Drive
    file_content = gdrive_sdk.get_document(doc_id)

    # Generate embedding using OpenAI
    response = openai.embeddings.create(
        model="text-embedding-3-small",
        input=file_content,
    )
    embedding = response.data[0].embedding

    # Store the embedding via a separate task rather than writing inline
    update_db_record.send(doc_id, embedding)


@hy.task
def sync_google_drive_documents():
    """Queue a ``process_document`` task for every document ID listed."""
    all_doc_ids = gdrive_sdk.list_document_ids()
    for doc_id in all_doc_ids:
        process_document.send(doc_id)
# Process a single document
curl -X POST http://localhost:8000/sync/document \
  -H "Content-Type: application/json" \
  -d '{"doc_id": "1BxiMVs0XRA5nFMdKvBdBZjgmUUqptlbs74OgvE2upms"}'

# Process all documents
curl -X POST http://localhost:8000/sync/all-documents \
  -H "Content-Type: application/json" \
  -d '{}'
Once embeddings are generated, perform semantic search:
-- Find documents similar to a query
-- NOTE(review): `<=>` is presumably pgvector's cosine-distance operator, so
-- `1 - distance` yields cosine similarity; confirm against the column type.
-- `query_embedding` looks like a placeholder for the query's embedding
-- vector; bind it as a parameter when running this.
SELECT doc_id, title, 1 - (embedding <=> query_embedding) AS similarity
FROM documents
ORDER BY embedding <=> query_embedding
LIMIT 10;