// migrate-lsw/scrape-ladsawai-book.js
// 2026-01-13 10:17:00 +07:00
//
// 136 lines
// 4.1 KiB
// JavaScript

// Official correspondence of the Department of Local Administration (กรมส่งเสริมการปกครองท้องถิ่น)
const { execFileSync, execSync } = require("child_process");
const cheerio = require("cheerio");
const fs = require("fs");
const path = require("path");
// Origin of the site being scraped; page URLs and relative links are built on top of it.
const BASE = "https://ladsawai.go.th";
// Output directory, located under the current working directory. The Thai folder name
// reads roughly "official letters from the provincial local administration office".
const OUT = path.join(process.cwd(), "หนังสือราชการจากท้องถิ่นจังหวัด");
// Create the output directory at load time; `recursive: true` also creates missing
// parents and does not fail when the directory already exists.
fs.mkdirSync(OUT, { recursive: true });
/**
 * Download a URL with the system `curl` binary and return the response body as text.
 *
 * curl is invoked directly (no shell), with every argument passed as a separate
 * array element, so characters such as quotes, `$`, backticks or `;` inside the
 * URL can never be interpreted as shell syntax.
 *
 * @param {string} url Address to fetch; redirects are followed (`-L`).
 * @returns {string} Response body decoded as UTF-8 (up to 30 MiB).
 * @throws {Error} When curl cannot be started, exits with a non-zero status,
 *   or the output exceeds the buffer limit.
 */
function curlHtml(url) {
  const curlArgs = [
    "-L",
    "-s",
    "-H",
    "User-Agent: Mozilla/5.0",
    "-H",
    "Accept-Language: th-TH,th;q=0.9",
    // `--url` makes curl treat the next argument as the URL even if it starts with "-".
    "--url",
    String(url),
  ];
  return execFileSync("curl", curlArgs, {
    encoding: "utf8",
    maxBuffer: 30 * 1024 * 1024,
  });
}
/**
 * Turn an `href`/`src` value scraped from a page into an absolute URL.
 *
 * Resolution is done against the site root (`BASE`), not against the page the
 * value was found on, and the path is not normalized (`./` and `../` are kept).
 *
 * @param {string|null|undefined} src Raw attribute value.
 * @returns {string|null} Absolute URL, or `null` when the value is missing or blank.
 */
function absUrl(src) {
  if (!src) return null;
  const ref = String(src).trim();
  if (ref === "") return null;

  // A leading "<scheme>:" means the reference is already absolute (http:, https:,
  // mailto:, javascript:, ...). Return it untouched instead of gluing it onto BASE.
  // Checking for a real scheme also keeps relative paths such as "httpdocs/a.pdf"
  // from being mistaken for absolute URLs.
  if (/^[a-z][a-z0-9+.-]*:/i.test(ref)) return ref;

  // Protocol-relative reference ("//host/path"): reuse the scheme of BASE.
  if (ref.startsWith("//")) return `${new URL(BASE).protocol}${ref}`;

  // Root-relative path.
  if (ref.startsWith("/")) return BASE + ref;

  // Plain relative path, resolved against the site root.
  return `${BASE}/${ref}`;
}
/**
 * Build the listing URL for one page of a menu on the site.
 *
 * TODO: make the path match the real one (check the browser's address bar).
 * Hypothetical example of another path shape:
 *   `${BASE}/public/dispatch/index/menu/XXXX/page/${page}`
 *
 * @param {number|string} page Page number to request.
 * @param {number|string} [menuId=1243] Menu identifier; change it to match the
 *   menu being scraped.
 * @returns {string} Absolute URL of the listing page.
 */
function buildUrl(page, menuId = 1243) {
  return `${BASE}/public/dispatch/data/index/menu/${menuId}/page/${page}`;
}
/**
 * Download one listing page, extract its table rows and write them to
 * `page-<n>.json` inside the output directory.
 *
 * @param {number} page Page number to scrape.
 * @param {boolean} [saveHtml=false] When true, also dump the raw HTML to
 *   `debug-page-<n>.html` for selector debugging.
 * @returns {Array<object>} The items parsed from this page.
 */
function scrapeOnePage(page, saveHtml = false) {
  // Collapse runs of whitespace into single spaces and strip both ends.
  const squash = (text) => text.replace(/\s+/g, " ").trim();

  const url = buildUrl(page);
  const html = curlHtml(url);
  if (saveHtml) {
    const debugFile = path.join(OUT, `debug-page-${page}.html`);
    fs.writeFileSync(debugFile, html, "utf8");
  }

  const $ = cheerio.load(html);
  const rows = $("table.dispatch_table tbody tr.dispatch_odd, table.dispatch_table tbody tr.dispatch_even");
  console.log(`page ${page} rows =`, rows.length);

  const items = [];
  for (const tr of rows.toArray()) {
    const cells = $(tr).find("td.dispatch_normal");
    // Expected cell order: date, number, topic, type. Skip rows with fewer cells.
    if (cells.length < 4) continue;

    const topicCell = cells.eq(2);
    const anchor = topicCell.find("a[href]").first();
    // Prefer the link text; fall back to the whole cell when there is no link text.
    const title = squash(anchor.text() || topicCell.text());
    if (!title) continue;

    // Some rows carry pdf/file icons, so both link targets and image sources are collected.
    const attachments = [];
    for (const el of topicCell.find("a[href], img[src]").toArray()) {
      const tag = el.tagName?.toLowerCase();
      let ref;
      if (tag === "a") {
        ref = $(el).attr("href");
      } else if (tag === "img") {
        ref = $(el).attr("src");
      }
      if (ref) attachments.push(absUrl(ref));
    }

    const date = squash(cells.eq(0).text());
    const no = squash(cells.eq(1).text());
    const type = squash(cells.eq(3).text());
    const link = absUrl(anchor.attr("href"));

    items.push({
      date: date || null,
      no: no || null,
      title,
      type: type || null,
      link: link || null,
      // De-duplicate while keeping first-seen order, then drop empty entries.
      fileLinks: Array.from(new Set(attachments)).filter(Boolean),
      sourcePage: page,
      sourceUrl: url,
    });
  }

  const output = {
    source: url,
    scrapedAt: new Date().toISOString(),
    page,
    count: items.length,
    items,
  };
  const jsonFile = path.join(OUT, `page-${page}.json`);
  fs.writeFileSync(jsonFile, JSON.stringify(output, null, 2), "utf8");
  console.log(`✅ page ${page} -> items ${items.length}`);
  return items;
}
/**
 * Entry point: scrape every listing page in order, de-duplicate the items across
 * pages and write the combined result to `all.json` in the output directory.
 */
(function main() {
  // Last page number, read off the site's pagination (it goes up to 1231).
  const lastPage = 1231;
  const collected = [];
  const seenKeys = new Set();
  // Two items are considered the same when all of these fields match.
  const keyOf = ({ date, no, title, type, link }) =>
    `${date}|${no}|${title}|${type}|${link}`;

  let pageNo = 1;
  while (pageNo <= lastPage) {
    const isFirstPage = pageNo === 1;
    // The raw HTML is saved for the first page only, for debugging.
    const pageItems = scrapeOnePage(pageNo, isFirstPage);

    pageItems.forEach((entry) => {
      const key = keyOf(entry);
      if (!seenKeys.has(key)) {
        seenKeys.add(key);
        collected.push(entry);
      }
    });

    // Time saver: an empty first page means the URL or selector is wrong,
    // so stop right away and debug instead of walking every page.
    if (isFirstPage && pageItems.length === 0) {
      console.log("❌ page 1 = 0: เปิด debug-page-1.html แล้วเช็ค buildUrl(menuId/path)");
      break;
    }
    pageNo += 1;
  }

  const summary = {
    scrapedAt: new Date().toISOString(),
    totalItems: collected.length,
    items: collected,
  };
  fs.writeFileSync(path.join(OUT, `all.json`), JSON.stringify(summary, null, 2), "utf8");
  console.log("✅ Total:", collected.length);
})();