Skip to content

Commit ba4768f

Browse files
committed
feat: switch to jsdom for DOM processing and improve database queries
This commit replaces happy-dom with jsdom for HTML parsing and manipulation because happy-dom caused memory issues during processing. The following changes were made:
- Updated `vitest.config.ts` to use the `jsdom` environment.
- Replaced `happy-dom` with `jsdom` in `package.json` and `package-lock.json`.
- Updated `HtmlProcessor.ts` and `SemanticMarkdownSplitter.ts` to use `jsdom` for DOM operations.
- Added case-insensitive (`LOWER()`) expression indexes on the `library` and `version` columns in `init-db.sql`, replacing the previous plain indexes.
- Updated database queries in `DocumentStore.ts` to use `LOWER()` for case-insensitive matching of `library` and `version`.
- Updated `ScrapeTool.ts` to normalize the version using `semver`.
1 parent bc805fc commit ba4768f

File tree

8 files changed

+532
-53
lines changed

8 files changed

+532
-53
lines changed

init-db.sql

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -17,9 +17,9 @@ CREATE TABLE IF NOT EXISTS documents (
1717
sort_order BIGSERIAL
1818
);
1919

20-
-- Create indexes for improved search performance
21-
CREATE INDEX IF NOT EXISTS idx_documents_library ON documents(library);
22-
CREATE INDEX IF NOT EXISTS idx_documents_version ON documents(version);
20+
-- Create case-insensitive indexes for improved search performance
21+
CREATE INDEX IF NOT EXISTS idx_documents_library_lower ON documents(LOWER(library));
22+
CREATE INDEX IF NOT EXISTS idx_documents_version_lower ON documents(LOWER(version));
2323
CREATE INDEX IF NOT EXISTS idx_documents_content_search ON documents USING GIN(content_search);
2424
CREATE INDEX IF NOT EXISTS idx_documents_url_sort ON documents(url, library, version, sort_order);
2525

@@ -49,7 +49,7 @@ BEGIN
4949
END IF;
5050

5151
INSERT INTO documents (library, version, url, content, metadata, embedding, content_search)
52-
VALUES (p_library, p_version, p_url, p_content, p_metadata, p_embedding, to_tsvector('english',
52+
VALUES (LOWER(p_library), LOWER(p_version), p_url, p_content, p_metadata, p_embedding, to_tsvector('english',
5353
'<title>' || coalesce(p_metadata->>'title', '') || '</title>' ||
5454
'<url>' || coalesce(p_metadata->>'url', '') || '</url>' ||
5555
'<path>' || coalesce((SELECT string_agg(elem, ' / ') FROM jsonb_array_elements_text(p_metadata->'path') AS elem), '') || '</path>' ||
@@ -80,8 +80,8 @@ BEGIN
8080

8181
WITH deleted AS (
8282
DELETE FROM documents
83-
WHERE library = p_library
84-
AND version = p_version
83+
WHERE LOWER(library) = LOWER(p_library)
84+
AND LOWER(version) = LOWER(p_version)
8585
RETURNING 1
8686
)
8787
SELECT COUNT(*) INTO v_count FROM deleted;

0 commit comments

Comments
 (0)