存储静态网站
本文章的的目的是编写一个程序,将静态完整的所有网页数据存入到 markdown 中.方便后续大模型推理
创建表
-- web_page_cache: one row per URL with "text", "html" and "markdown" columns
-- for the page content.
-- WARNING: the DROP below discards any existing table together with its rows.
DROP TABLE IF EXISTS web_page_cache;
CREATE TABLE "public"."web_page_cache" (
    "id"          BIGINT                   NOT NULL PRIMARY KEY,
    "url"         VARCHAR                  UNIQUE,
    "title"       VARCHAR,
    "type"        VARCHAR,
    "text"        TEXT,
    "html"        TEXT,
    -- Fix: a stray "urlurl" token used to follow this column and made the
    -- whole CREATE TABLE statement a syntax error.
    "markdown"    TEXT,
    "remark"      VARCHAR(256),
    -- Audit columns; the timestamps default to the time of insertion.
    "creator"     VARCHAR(64)              DEFAULT '',
    "create_time" TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP,
    "updater"     VARCHAR(64)              DEFAULT '',
    "update_time" TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP,
    -- NOTE(review): presumably a soft-delete flag (0 = live) -- confirm with the application code.
    "deleted"     SMALLINT                 DEFAULT 0,
    "tenant_id"   BIGINT                   NOT NULL DEFAULT 0
);

-- varchar_pattern_ops B-tree indexes: these let anchored pattern matches
-- (e.g. LIKE 'prefix%') use an index even when the database locale is not "C".
-- The UNIQUE constraint on "url" already has its own default-opclass index.
CREATE INDEX "web_page_cache_url"
    ON "web_page_cache" USING btree ("url" varchar_pattern_ops);
CREATE INDEX "web_page_cache_title"
    ON "web_page_cache" USING btree ("title" varchar_pattern_ops);
-- hawaii_web_page: same column layout as the generic page cache table
-- (url, title, type, text/html/markdown bodies, audit columns).
-- Re-running this script drops the table and every row in it first.
DROP TABLE IF EXISTS hawaii_web_page;

CREATE TABLE "public"."hawaii_web_page" (
    -- identity and page content
    "id"          BIGINT NOT NULL,
    "url"         VARCHAR,
    "title"       VARCHAR,
    "type"        VARCHAR,
    "text"        TEXT,
    "html"        TEXT,
    "markdown"    TEXT,
    "remark"      VARCHAR(256),

    -- bookkeeping columns
    "creator"     VARCHAR(64) DEFAULT '',
    "create_time" TIMESTAMPTZ NOT NULL DEFAULT CURRENT_TIMESTAMP,
    "updater"     VARCHAR(64) DEFAULT '',
    "update_time" TIMESTAMPTZ NOT NULL DEFAULT CURRENT_TIMESTAMP,
    "deleted"     SMALLINT    DEFAULT 0,
    "tenant_id"   BIGINT      NOT NULL DEFAULT 0,

    -- Unnamed table-level constraints receive the same generated names
    -- (<table>_pkey, <table>_url_key) as the column-level form.
    PRIMARY KEY ("id"),
    UNIQUE ("url")
);

-- Pattern-ops B-tree indexes on the two columns used for text lookups.
CREATE INDEX "hawaii_web_page_url"
    ON "hawaii_web_page"
    USING btree ("url" varchar_pattern_ops);

CREATE INDEX "hawaii_web_page_title"
    ON "hawaii_web_page"
    USING btree ("title" varchar_pattern_ops);
-- web_page_url: list of URLs with a crawl "status" and a "tried" counter.
-- Re-running this script drops the table and every row in it first.
DROP TABLE IF EXISTS web_page_url;

CREATE TABLE "public"."web_page_url" (
    "id"          BIGINT NOT NULL,
    "url"         VARCHAR,

    -- status: 0 = added to the task queue, 1 = crawl finished.
    -- Open question carried over from the original note (item "2."):
    -- how should a failed crawl be determined?
    "status"      INTEGER DEFAULT 0,
    -- NOTE(review): presumably the number of crawl attempts so far -- confirm with the crawler code.
    "tried"       INTEGER DEFAULT 0,
    "remark"      VARCHAR(256),

    -- bookkeeping columns
    "creator"     VARCHAR(64) DEFAULT '',
    "create_time" TIMESTAMPTZ NOT NULL DEFAULT CURRENT_TIMESTAMP,
    "updater"     VARCHAR(64) DEFAULT '',
    "update_time" TIMESTAMPTZ NOT NULL DEFAULT CURRENT_TIMESTAMP,
    "deleted"     SMALLINT    DEFAULT 0,
    "tenant_id"   BIGINT      NOT NULL DEFAULT 0,

    -- Unnamed table-level constraints receive the same generated names
    -- (<table>_pkey, <table>_url_key) as the column-level form.
    PRIMARY KEY ("id"),
    UNIQUE ("url")
);

-- Pattern-ops B-tree index for anchored pattern lookups on the URL.
CREATE INDEX "web_page_url_url"
    ON "web_page_url"
    USING btree ("url" varchar_pattern_ops);
-- hawaii_kapiolani_web_page: same column layout as the other *_web_page
-- tables (url, title, type, text/html/markdown bodies, audit columns).
-- Re-running this script drops the table and every row in it first.
DROP TABLE IF EXISTS hawaii_kapiolani_web_page;

CREATE TABLE "public"."hawaii_kapiolani_web_page" (
    -- identity and page content
    "id"          BIGINT NOT NULL,
    "url"         VARCHAR,
    "title"       VARCHAR,
    "type"        VARCHAR,
    "text"        TEXT,
    "html"        TEXT,
    "markdown"    TEXT,
    "remark"      VARCHAR(256),

    -- bookkeeping columns
    "creator"     VARCHAR(64) DEFAULT '',
    "create_time" TIMESTAMPTZ NOT NULL DEFAULT CURRENT_TIMESTAMP,
    "updater"     VARCHAR(64) DEFAULT '',
    "update_time" TIMESTAMPTZ NOT NULL DEFAULT CURRENT_TIMESTAMP,
    "deleted"     SMALLINT    DEFAULT 0,
    "tenant_id"   BIGINT      NOT NULL DEFAULT 0,

    -- Unnamed table-level constraints receive the same generated names
    -- (<table>_pkey, <table>_url_key) as the column-level form.
    PRIMARY KEY ("id"),
    UNIQUE ("url")
);