// 211 lines · 6.1 KiB · JavaScript
import crypto from "crypto";
|
|
import OpenAI from "openai";
|
|
import { debug as dbg } from "./debug.js";
|
|
import { searchProducts } from "./wooProducts.js";
|
|
import {
|
|
searchProductAliases,
|
|
getProductEmbedding,
|
|
upsertProductEmbedding,
|
|
} from "../db/repo.js";
|
|
|
|
/**
 * Resolve the OpenAI API key from the environment.
 *
 * `OPENAI_API_KEY` is checked first, then `OPENAI_APIKEY` as a fallback.
 * Empty strings are treated as "not set".
 *
 * @returns {string|null} The first non-empty key found, or `null` when neither variable is set.
 */
function getOpenAiKey() {
  return process.env.OPENAI_API_KEY || process.env.OPENAI_APIKEY || null;
}
|
|
|
|
/**
 * Resolve the embeddings model name.
 *
 * @returns {string} The value of `OPENAI_EMBEDDINGS_MODEL` when it is set and
 *   non-empty, otherwise the default `"text-embedding-3-small"`.
 */
function getEmbeddingsModel() {
  return process.env.OPENAI_EMBEDDINGS_MODEL || "text-embedding-3-small";
}
|
|
|
|
/**
 * Normalize free text for comparison and hashing.
 *
 * Lowercases the input, replaces the punctuation characters
 * `¿ ? ¡ ! . , ; : ( ) "` with spaces, collapses whitespace runs into a
 * single space and trims both ends.
 *
 * @param {*} s - Value to normalize; any falsy value is treated as an empty string.
 * @returns {string} The normalized text (possibly empty).
 */
function normalizeText(s) {
  return String(s || "")
    .toLowerCase()
    .replace(/[¿?¡!.,;:()"]/g, " ")
    .replace(/\s+/g, " ")
    .trim();
}
|
|
|
|
/**
 * Compute the SHA-256 digest of a piece of text.
 *
 * @param {*} s - Value to hash; any falsy value is hashed as the empty string.
 * @returns {string} Lowercase hexadecimal SHA-256 digest.
 */
function hashText(s) {
  return crypto.createHash("sha256").update(String(s || "")).digest("hex");
}
|
|
|
|
/**
 * Cosine similarity between two numeric vectors.
 *
 * Returns 0 (rather than throwing or yielding NaN) when either argument is
 * not an array, when the lengths differ, when the vectors are empty, or when
 * either vector has zero magnitude. Entries that do not convert to a number
 * are treated as 0.
 *
 * @param {number[]} a - First vector.
 * @param {number[]} b - Second vector.
 * @returns {number} Similarity in the closed range [-1, 1].
 */
function cosine(a, b) {
  if (!Array.isArray(a) || !Array.isArray(b) || a.length !== b.length || a.length === 0) {
    return 0;
  }

  let dot = 0;
  let normA = 0;
  let normB = 0;
  for (let i = 0; i < a.length; i++) {
    const x = Number(a[i]) || 0;
    const y = Number(b[i]) || 0;
    dot += x * y;
    normA += x * x;
    normB += y * y;
  }
  if (normA === 0 || normB === 0) return 0;

  const similarity = dot / (Math.sqrt(normA) * Math.sqrt(normB));
  // Floating-point rounding can push the ratio marginally outside [-1, 1];
  // clamp so callers can rely on the mathematical range.
  return Math.max(-1, Math.min(1, similarity));
}
|
|
|
|
/**
 * Build the text used to describe a product candidate.
 *
 * Concatenates, separated by single spaces: the candidate name, each
 * category's name and slug, and each attribute's name followed by its
 * options. Missing or empty fragments are skipped so the result never
 * contains leading, trailing or doubled separators caused by absent fields.
 *
 * @param {object|null|undefined} c - Candidate with optional `name`,
 *   `categories` (array of `{ name, slug }`) and `attributes`
 *   (array of `{ name, options }`).
 * @returns {string} Space-joined descriptive text; empty string when nothing is available.
 */
function candidateText(c) {
  const parts = [c?.name];

  if (Array.isArray(c?.categories)) {
    for (const category of c.categories) {
      parts.push(category?.name, category?.slug);
    }
  }

  if (Array.isArray(c?.attributes)) {
    for (const attribute of c.attributes) {
      parts.push(attribute?.name);
      if (Array.isArray(attribute?.options)) {
        parts.push(attribute.options.join(" "));
      }
    }
  }

  // Drop empty fragments (missing name, empty options list, ...) before joining.
  return parts.filter(Boolean).join(" ");
}
|
|
|
|
/**
 * Score how well a candidate's name matches the query literally.
 *
 * Both sides are passed through `normalizeText` first. The score is:
 *   - 0   when either normalized string is empty,
 *   - 1.0 when the name equals the query,
 *   - 0.7 when the name contains the query as a substring,
 *   - otherwise the fraction of distinct query tokens that also appear as
 *     tokens of the name.
 *
 * NOTE(review): the token-overlap fallback can reach 1.0 (all query words
 * present, in a different order), which outranks the 0.7 given to a
 * contiguous substring match. Confirm whether that ordering is intended.
 *
 * @param {*} query - Raw query text.
 * @param {object|null|undefined} candidate - Candidate; only `name` is read.
 * @returns {number} Score in the range [0, 1].
 */
function literalScore(query, candidate) {
  const q = normalizeText(query);
  const name = normalizeText(candidate?.name || "");
  if (!q || !name) return 0;
  if (name === q) return 1.0;
  if (name.includes(q)) return 0.7;

  const queryTokens = new Set(q.split(" ").filter(Boolean));
  const nameTokens = new Set(name.split(" ").filter(Boolean));

  let hits = 0;
  for (const token of queryTokens) {
    if (nameTokens.has(token)) hits++;
  }
  return hits / Math.max(queryTokens.size, 1);
}
|
|
|
|
/**
 * Get an embedding vector for a piece of text, using the stored embedding
 * when one exists for the same normalized content.
 *
 * Flow:
 *   1. Without an OpenAI key, return `error: "OPENAI_NO_KEY"`.
 *   2. Normalize the text; when nothing is left, return `error: "EMPTY_TEXT"`
 *      without calling the API (the embeddings endpoint rejects empty input,
 *      and letting that throw would abort the caller's whole enrichment pass).
 *   3. Look up a stored embedding by tenant and SHA-256 of the normalized text.
 *   4. On a miss, request the embedding from OpenAI and store it.
 *
 * NOTE(review): the stored embedding is looked up by content hash only, so a
 * vector produced by a previously configured model is returned as-is —
 * confirm whether the lookup should also take the model into account.
 *
 * @param {object} params
 * @param {*} params.tenantId - Tenant identifier forwarded to the repository calls.
 * @param {*} params.text - Text to embed.
 * @returns {Promise<{embedding: number[]|null, cached: boolean, model: string|null, error?: string}>}
 *   The vector (or `null`), whether it came from storage, and the model name.
 * @throws Propagates errors raised by the repository calls or the OpenAI client.
 */
async function embedText({ tenantId, text }) {
  const key = getOpenAiKey();
  if (!key) {
    return { embedding: null, cached: false, model: null, error: "OPENAI_NO_KEY" };
  }

  const content = normalizeText(text);
  if (!content) {
    return { embedding: null, cached: false, model: null, error: "EMPTY_TEXT" };
  }

  const contentHash = hashText(content);
  const cached = await getProductEmbedding({ tenant_id: tenantId, content_hash: contentHash });
  if (cached?.embedding) {
    return { embedding: cached.embedding, cached: true, model: cached.model || null };
  }

  const client = new OpenAI({ apiKey: key });
  const model = getEmbeddingsModel();
  const resp = await client.embeddings.create({ model, input: content });
  const vector = resp?.data?.[0]?.embedding || null;

  // Only persist well-formed vectors so a malformed response is never stored.
  if (Array.isArray(vector)) {
    await upsertProductEmbedding({
      tenant_id: tenantId,
      content_hash: contentHash,
      content_text: content,
      embedding: vector,
      model,
    });
  }

  return { embedding: vector, cached: false, model };
}
|
|
|
|
/**
 * De-duplicate candidates by numeric `woo_product_id`.
 *
 * Entries without a truthy `woo_product_id` are dropped. When the same id
 * appears more than once, the fields of the later entry override those of the
 * earlier one, `_score` becomes the higher of the two scores, and
 * `_score_detail` is taken from whichever entry supplied that higher score so
 * the breakdown stays consistent with the number it explains.
 *
 * @param {Array<object>} list - Candidates to merge; a non-array yields `[]`.
 * @returns {Array<object>} One shallow-copied entry per product id, in first-seen order.
 */
function mergeCandidates(list) {
  const byId = new Map();

  for (const candidate of Array.isArray(list) ? list : []) {
    if (!candidate?.woo_product_id) continue;

    const id = Number(candidate.woo_product_id);
    const prev = byId.get(id);
    if (!prev) {
      byId.set(id, { ...candidate });
      continue;
    }

    const prevScore = prev._score || 0;
    const nextScore = candidate._score || 0;
    const merged = { ...prev, ...candidate, _score: Math.max(prevScore, nextScore) };
    // Keep the breakdown that belongs to the winning score.
    if (prevScore > nextScore && prev._score_detail !== undefined) {
      merged._score_detail = prev._score_detail;
    }
    byId.set(id, merged);
  }

  return [...byId.values()];
}
|
|
|
|
/**
 * retrieveCandidates: combines the Woo literal search, alias boosts and
 * (optionally) embedding similarity into one ranked candidate list.
 *
 * Steps:
 *   1. Look up aliases and run the Woo product search for the query.
 *   2. Score each Woo item as `literalScore + alias boost`.
 *   3. When a query embedding is available, add `max(0, cosine)` to the score
 *      of the best-scoring candidates (capped at 25 embedding lookups).
 *   4. Sort by score, descending, and keep the top `limit`.
 *
 * Embedding failures never fail the call: they are recorded in
 * `audit.embeddings` and the literal/alias ranking is returned instead.
 *
 * @param {object} params
 * @param {*} params.tenantId - Tenant identifier forwarded to the lookups.
 * @param {*} params.query - Free-text query; blank input short-circuits.
 * @param {Array} [params.attributes=[]] - Accepted for interface compatibility; not read here.
 * @param {Array} [params.preparation=[]] - Accepted for interface compatibility; not read here.
 * @param {number|string} [params.limit=12] - Maximum results, clamped to 1..50
 *   (unparseable or zero values fall back to 12).
 * @returns {Promise<{candidates: Array<object>, audit: object}>} Ranked candidates
 *   (each carrying `_score` and `_score_detail`) plus an audit trail.
 * @throws Propagates errors from the alias lookup and the Woo search.
 */
export async function retrieveCandidates({
  tenantId,
  query,
  attributes = [],
  preparation = [],
  limit = 12,
}) {
  // Upper bound on embedding lookups per call.
  const EMBED_MAX = 25;
  const byScoreDesc = (a, b) => (b._score || 0) - (a._score || 0);

  const lim = Math.max(1, Math.min(50, parseInt(limit, 10) || 12));
  const q = String(query || "").trim();
  if (!q) {
    return { candidates: [], audit: { reason: "empty_query" } };
  }

  const audit = { query: q, sources: {}, boosts: {}, embeddings: {} };

  // The alias lookup and the Woo search are independent, so run them together.
  const [aliasRows, wooResult] = await Promise.all([
    searchProductAliases({ tenant_id: tenantId, q, limit: 20 }),
    searchProducts({ tenantId, q, limit: lim, forceWoo: true }),
  ]);

  // Guard against a missing alias result, mirroring the guard on Woo items below.
  const aliases = Array.isArray(aliasRows) ? aliasRows : [];
  const aliasBoostByProduct = new Map();
  for (const alias of aliases) {
    if (!alias?.woo_product_id) continue;
    const id = Number(alias.woo_product_id);
    const boost = Number(alias.boost || 0) || 0;
    // Several aliases may point at the same product: keep the strongest boost.
    aliasBoostByProduct.set(id, Math.max(aliasBoostByProduct.get(id) || 0, boost));
  }
  audit.sources.aliases = aliases.length;

  const { items: wooItems, source: wooSource } = wooResult;
  audit.sources.woo = { source: wooSource, count: wooItems?.length || 0 };

  let candidates = (wooItems || []).map((c) => {
    const lit = literalScore(q, c);
    const boost = aliasBoostByProduct.get(Number(c.woo_product_id)) || 0;
    return { ...c, _score: lit + boost, _score_detail: { literal: lit, alias_boost: boost } };
  });

  // Embeddings: optional, only when a key is configured and there are candidates.
  if (candidates.length) {
    try {
      const queryEmb = await embedText({ tenantId, text: q });
      if (Array.isArray(queryEmb.embedding)) {
        audit.embeddings.query = { cached: queryEmb.cached, model: queryEmb.model };

        // Rank by literal + alias score first so the embedding budget is spent
        // on the most promising candidates rather than on the first ones Woo
        // happened to return.
        candidates.sort(byScoreDesc);

        const enriched = [];
        for (const c of candidates.slice(0, EMBED_MAX)) {
          const emb = await embedText({ tenantId, text: candidateText(c) });
          const cos = Array.isArray(emb.embedding)
            ? cosine(queryEmb.embedding, emb.embedding)
            : 0;
          enriched.push({
            ...c,
            // Negative similarity never penalizes a candidate.
            _score: (c._score || 0) + Math.max(0, cos),
            _score_detail: { ...(c._score_detail || {}), cosine: cos, emb_cached: emb.cached },
          });
        }

        // Merge with the remaining candidates that got no embedding.
        const tail = candidates.slice(EMBED_MAX);
        candidates = mergeCandidates([...enriched, ...tail]);
      } else {
        audit.embeddings.query = { error: queryEmb.error || "no_embedding" };
      }
    } catch (e) {
      // Best effort: keep the literal/alias ranking and record why enrichment failed.
      audit.embeddings.error = String(e?.message || e);
    }
  }

  candidates.sort(byScoreDesc);
  const finalList = candidates.slice(0, lim);

  if (dbg.resolve) {
    console.log("[catalogRetrieval] candidates", {
      query: q,
      top: finalList.slice(0, 5).map((c) => ({
        id: c.woo_product_id,
        name: c.name,
        score: c._score,
        detail: c._score_detail,
      })),
    });
  }

  return { candidates: finalList, audit };
}
|