migrate-lsw/scrape-ladsawai-table.js
2026-01-13 10:17:00 +07:00

118 lines
3.2 KiB
JavaScript

// Scraper for the "Government procurement announcements (e-GP)" listing on ladsawai.go.th.
const { execFileSync, execSync } = require("child_process");
const fs = require("fs");
const path = require("path");
const cheerio = require("cheerio");
// Origin of the site being scraped; site-relative links are prefixed with it.
const BASE = "https://ladsawai.go.th";
// Every output file goes into this folder, located in the current working directory.
const OUT = path.resolve("ประกาศจัดซื้อจัดจ้างภาครัฐ (egp)");
// Create the folder up front; with `recursive` this is a no-op when it already exists.
fs.mkdirSync(OUT, { recursive: true });
/**
 * Download a URL with the system `curl` binary and return the response body.
 *
 * curl is started directly (no intermediate shell) and the URL is handed over
 * as a single argument, so characters such as `"`, `$`, `&` or backticks in
 * the URL are never interpreted as shell syntax.
 *
 * @param {string} url - Address to fetch; redirects are followed (`-L`).
 * @returns {string} Response body decoded as UTF-8.
 * @throws {Error} When curl cannot be spawned, exits with a non-zero status,
 *   or produces more output than the 20 MiB buffer allows.
 */
function curlHtml(url) {
  const args = [
    "-L",
    "-s",
    // `--url` makes curl treat the next argument as the URL even if it starts with "-".
    "--url",
    url,
    "-H",
    "User-Agent: Mozilla/5.0",
    "-H",
    "Accept-Language: th-TH,th;q=0.9",
  ];
  return execFileSync("curl", args, {
    encoding: "utf8",
    maxBuffer: 20 * 1024 * 1024,
  });
}
/**
 * Turn an href/src attribute value into an absolute URL string.
 *
 * - Empty, missing or whitespace-only values yield `null`.
 * - Values that already carry a scheme (`https:`, `mailto:`, ...) are returned
 *   as they are, apart from trimming surrounding whitespace.
 * - Protocol-relative values (`//host/path`) get the scheme of `BASE`.
 * - Everything else is treated as a path on `BASE`; a missing leading slash
 *   is added so the path is never glued straight onto the host name.
 *
 * The path itself is not re-encoded, so site-relative links keep the exact
 * characters found in the page.
 *
 * @param {string|null|undefined} src - Raw attribute value.
 * @returns {string|null} Absolute URL, or `null` when there is nothing to resolve.
 */
function absUrl(src) {
  if (!src) return null;

  const href = src.trim();
  if (!href) return null;

  // Any explicit scheme means the value is already absolute.
  if (/^[a-z][a-z\d+.-]*:/i.test(href)) return href;

  // Protocol-relative reference: borrow the scheme from BASE.
  if (href.startsWith("//")) return `${new URL(BASE).protocol}${href}`;

  return href.startsWith("/") ? `${BASE}${href}` : `${BASE}/${href}`;
}
/**
 * Fetch one page of the e-GP listing, extract its table rows and write them
 * to `egp-menu-<menuId>-page-<page>.json` inside OUT.
 *
 * A row is kept only when it has at least three cells and the first link in
 * the third cell has non-empty text. Cells are read as: date, category, title.
 *
 * @param {number|string} menuId - Menu id placed in the listing URL.
 * @param {number|string} page - Page number placed in the listing URL.
 * @param {boolean} [saveHtml=false] - Also store the raw HTML of the page in OUT.
 * @returns {Array<object>} The extracted items, in table order.
 */
function scrapeOnePage(menuId, page, saveHtml = false) {
  const url = `${BASE}/public/rss/egp/listegp/menu/${menuId}/page/${page}`;
  const html = curlHtml(url);

  if (saveHtml) {
    const htmlFile = path.join(OUT, `page-egp-menu-${menuId}-page-${page}.html`);
    fs.writeFileSync(htmlFile, html, "utf8");
  }

  const $ = cheerio.load(html);

  // Collapse every whitespace run to one space and strip the ends.
  const cellText = (node) => node.text().replace(/\s+/g, " ").trim();

  // Listing table rows
  const items = [];
  for (const row of $("table tbody tr").toArray()) {
    const cells = $(row).find("td");
    if (cells.length < 3) continue;

    const anchor = cells.eq(2).find("a").first();
    const title = cellText(anchor);
    if (!title) continue;

    const dateText = cellText(cells.eq(0));
    const categoryText = cellText(cells.eq(1));
    const detailUrl = absUrl(anchor.attr("href")); // detail page, when the row links to one

    items.push({
      title,
      date: dateText || null,
      category: categoryText || null,
      link: detailUrl || null,
      sourcePage: page,
      sourceUrl: url,
    });
  }

  const report = {
    source: url,
    scrapedAt: new Date().toISOString(),
    menuId,
    page,
    count: items.length,
    items,
  };
  const jsonFile = path.join(OUT, `egp-menu-${menuId}-page-${page}.json`);
  fs.writeFileSync(jsonFile, JSON.stringify(report, null, 2), "utf8");

  console.log(`✅ EGP page ${page} -> items ${items.length}`);
  return items;
}
/**
 * Entry point: scrape every listing page in order, merge the items while
 * dropping duplicates, and write the merged result to
 * `list-menu-<menuId>-all.json` inside OUT.
 *
 * Two items count as duplicates only when title, date, category and link all
 * match. The key is built from fields the scraped items actually carry, and
 * JSON-encoding the tuple keeps a "|" inside a title from blurring the
 * boundary between fields.
 */
(function main() {
  const menuId = 1564; // menu id of the e-GP procurement listing
  const totalPages = 240;
  // Set to true to also keep the raw HTML of each page (one file per page).
  const saveHtml = false;

  const all = [];
  const seen = new Set();

  for (let page = 1; page <= totalPages; page++) {
    const items = scrapeOnePage(menuId, page, saveHtml);

    // Merge and de-duplicate across pages.
    for (const it of items) {
      const key = JSON.stringify([it.title, it.date, it.category, it.link]);
      if (seen.has(key)) continue;
      seen.add(key);
      all.push(it);
    }
  }

  const merged = {
    menuId,
    totalPages,
    scrapedAt: new Date().toISOString(),
    totalItems: all.length,
    items: all,
  };
  const outAll = path.join(OUT, `list-menu-${menuId}-all.json`);
  fs.writeFileSync(outAll, JSON.stringify(merged, null, 2), "utf8");

  console.log("✅ Saved merged JSON:", outAll);
  console.log("✅ Total unique items:", all.length);
})();