migrate-lsw/scrape-ladsawai-list.js
2026-01-13 10:17:00 +07:00

113 lines
2.5 KiB
JavaScript

const fs = require("fs");
const path = require("path");
const cheerio = require("cheerio");
const { chromium } = require("playwright");
// Origin of the site being scraped; absUrl() and scrapePage() prepend it to site-relative paths.
const BASE = "https://ladsawai.go.th";
// Output directory, resolved against the current working directory (not this script's own location).
const OUT = path.join(process.cwd(), "ประกาศจัดซื้อจัดจ้าง");
// Create the output directory at load time; `recursive: true` also creates missing parents
// and does not throw when the directory already exists.
fs.mkdirSync(OUT, { recursive: true });
/**
 * Turn an href/src attribute value into an absolute URL on the scraped site.
 *
 * - Falsy or whitespace-only input yields `null`.
 * - `http://` / `https://` URLs (case-insensitive scheme) are returned as-is, trimmed.
 * - Protocol-relative URLs (`//host/path`) get the scheme of BASE.
 * - Everything else is treated as a path on BASE.
 *
 * @param {string|null|undefined} src - Raw attribute value.
 * @returns {string|null} Absolute URL, or `null` when there is nothing to resolve.
 */
function absUrl(src) {
  if (!src) return null;

  // Attribute values scraped from HTML can carry stray whitespace or newlines.
  const ref = String(src).trim();
  if (!ref) return null;

  // Require the full "scheme://" prefix so a relative file name that merely
  // starts with "http" (e.g. "http-notice.pdf") is not mistaken for absolute.
  if (/^https?:\/\//i.test(ref)) return ref;

  // Protocol-relative reference: must be checked before the single-slash case,
  // otherwise it would be glued onto BASE as a path.
  if (ref.startsWith("//")) return `${new URL(BASE).protocol}${ref}`;

  if (ref.startsWith("/")) return `${BASE}${ref}`;
  return `${BASE}/${ref}`;
}
/**
 * Render one listing page in headless Chromium and extract its rows.
 *
 * Side effect: writes the rendered HTML to `debug-rendered-page-<page>.html`
 * inside OUT so the markup can be inspected afterwards.
 *
 * @param {number|string} menuId - Menu id interpolated into the listing URL.
 * @param {number|string} page - Page number interpolated into the listing URL.
 * @returns {Promise<Array<{title: string, date: string, link: (string|null),
 *   sourcePage: (number|string), sourceUrl: string}>>} One entry per row that has a non-empty title.
 * @throws Propagates Playwright errors (navigation failure, or no `.row.data-row`
 *   element appearing within 30 s).
 */
async function scrapePage(menuId, page) {
  const url = `${BASE}/public/list/data/index/menu/${menuId}/page/${page}`;
  const browser = await chromium.launch({ headless: true });

  let html;
  try {
    const pageObj = await browser.newPage();
    await pageObj.goto(url, { waitUntil: "networkidle", timeout: 60000 });
    // Wait for the page's JS to render the actual rows.
    await pageObj.waitForSelector(".row.data-row", { timeout: 30000 });
    html = await pageObj.content();
  } finally {
    // Close the browser even when navigation or the selector wait throws;
    // otherwise the Chromium process is leaked.
    await browser.close();
  }

  // Debug copy of the rendered HTML.
  fs.writeFileSync(
    path.join(OUT, `debug-rendered-page-${page}.html`),
    html,
    "utf8"
  );

  // Collapse runs of whitespace to single spaces and trim the ends.
  const squash = (text) => text.replace(/\s+/g, " ").trim();

  const $ = cheerio.load(html);
  const items = [];
  $(".row.data-row").each((_, row) => {
    const el = $(row);
    const left = el.find(".col-12.col-sm-10").first();

    // Title = text of the left column with <a> and <img> descendants removed.
    // Work on a clone so the anchor is still available for the link lookup below.
    const title = squash(left.clone().find("a,img").remove().end().text());
    if (!title) return;

    const link = absUrl(left.find("a[href]").first().attr("href"));
    const date = squash(el.find(".col-12.col-sm-2").text());

    items.push({
      title,
      date,
      link,
      sourcePage: page,
      sourceUrl: url,
    });
  });
  return items;
}
/**
 * Entry point: scrape every listing page of the configured menu, drop
 * duplicate rows, and write the combined result to `menu-<menuId>-all.json`
 * inside OUT.
 *
 * Pages are fetched sequentially. A failure on any page rejects the returned
 * promise, and no JSON file is written for that run.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const menuId = 1236;
  const totalPages = 12;
  const all = [];
  // Keys of rows already collected, so an identical row is kept only once.
  const seen = new Set();

  for (let p = 1; p <= totalPages; p++) {
    const items = await scrapePage(menuId, p);
    console.log(`✅ page ${p} -> ${items.length} items`);
    for (const it of items) {
      const key = `${it.title}|${it.date}|${it.link}`;
      if (seen.has(key)) continue;
      seen.add(key);
      all.push(it);
    }
  }

  const out = {
    menuId,
    totalItems: all.length,
    scrapedAt: new Date().toISOString(),
    items: all,
  };
  fs.writeFileSync(
    path.join(OUT, `menu-${menuId}-all.json`),
    JSON.stringify(out, null, 2),
    "utf8"
  );
  console.log("🎉 DONE:", all.length);
}

// Do not leave the promise floating: report the failure and signal it to the
// caller through a non-zero exit code.
main().catch((err) => {
  console.error("scrape failed:", err);
  process.exitCode = 1;
});