// Scraper for official letters (government correspondence) published for the Department of Local Administration.
|
|
|
|
const { execFileSync, execSync } = require("child_process");
const fs = require("fs");
const path = require("path");

const cheerio = require("cheerio");
|
|
|
|
const BASE = "https://ladsawai.go.th";
|
|
const OUT = path.join(process.cwd(), "หนังสือราชการจากท้องถิ่นจังหวัด");
|
|
fs.mkdirSync(OUT, { recursive: true });
|
|
|
|
/**
 * Download a URL with the system `curl` binary and return the response body.
 *
 * curl is started directly (no intermediate shell) and the URL is handed over as a
 * single argv entry, so quotes, `$`, backticks or spaces inside `url` can never be
 * interpreted as shell syntax.
 *
 * @param {string} url - Address to fetch; redirects are followed (`-L`).
 * @returns {string} The response body decoded as UTF-8.
 * @throws {Error} When curl cannot be spawned, exits with a non-zero status, or
 *   produces more than 30 MiB of output.
 */
function curlHtml(url) {
  const args = [
    "-L",
    "-s",
    url,
    "-H",
    "User-Agent: Mozilla/5.0",
    "-H",
    "Accept-Language: th-TH,th;q=0.9",
  ];
  return execFileSync("curl", args, {
    encoding: "utf8",
    maxBuffer: 30 * 1024 * 1024,
  });
}
|
|
|
|
/**
 * Turn an `href`/`src` attribute value into an absolute URL rooted at `BASE`.
 *
 * - Empty or missing input yields `null`.
 * - A value that already carries a URL scheme (`https:`, `http:`, `mailto:`, ...) is
 *   returned untouched.
 * - A protocol-relative value (`//host/path`) inherits the scheme of `BASE`.
 * - A root-relative value (`/path`) is appended to `BASE`.
 * - Anything else is treated as a path relative to the site root.
 *
 * @param {string|null|undefined} src - Raw attribute value taken from the page.
 * @returns {string|null} Absolute URL, or `null` when there is nothing to resolve.
 */
function absUrl(src) {
  if (!src) return null;

  // A real scheme is "letters/digits/+/./-" followed by ":"; a bare "http" prefix is
  // not enough, otherwise a relative path like "httpdocs/a.pdf" would be left unresolved.
  if (/^[a-z][a-z0-9+.-]*:/i.test(src)) return src;

  // Protocol-relative reference: keep its host, borrow the scheme from BASE.
  if (src.startsWith("//")) return `${new URL(BASE).protocol}${src}`;

  if (src.startsWith("/")) return `${BASE}${src}`;
  return `${BASE}/${src}`;
}
|
|
|
|
/**
 * Build the listing URL for one page of a dispatch menu.
 *
 * TODO: make sure the path matches the real site (check the browser address bar).
 * A hypothetical alternative layout would be
 * `${BASE}/public/dispatch/index/menu/XXXX/page/${page}`.
 *
 * @param {number} page - Page number to request.
 * @param {number} [menuId=1243] - Menu identifier embedded in the path; pass another
 *   value to target a different menu instead of editing this function.
 * @returns {string} URL of the requested listing page.
 */
function buildUrl(page, menuId = 1243) {
  return `${BASE}/public/dispatch/data/index/menu/${menuId}/page/${page}`;
}
|
|
|
|
/**
 * Fetch one listing page, extract its dispatch rows and persist them as JSON.
 *
 * The page is downloaded through `curlHtml(buildUrl(page))`, parsed with cheerio, and
 * every `tr.dispatch_odd` / `tr.dispatch_even` row inside `table.dispatch_table` that
 * has at least four `td.dispatch_normal` cells and a non-empty title becomes one item.
 * The result is written to `page-<page>.json` inside `OUT`.
 *
 * @param {number} page - Page number to scrape.
 * @param {boolean} [saveHtml=false] - When true, also dump the raw HTML to
 *   `debug-page-<page>.html` inside `OUT`.
 * @returns {Array<object>} The items extracted from this page.
 */
function scrapeOnePage(page, saveHtml = false) {
  // Collapse every whitespace run to one space and strip both ends.
  const squash = (text) => text.replace(/\s+/g, " ").trim();

  const url = buildUrl(page);
  const html = curlHtml(url);

  if (saveHtml) {
    const debugFile = path.join(OUT, `debug-page-${page}.html`);
    fs.writeFileSync(debugFile, html, "utf8");
  }

  const $ = cheerio.load(html);
  const rows = $("table.dispatch_table tbody tr.dispatch_odd, table.dispatch_table tbody tr.dispatch_even");
  console.log(`page ${page} rows =`, rows.length);

  const items = [];
  for (const tr of rows.toArray()) {
    const cells = $(tr).find("td.dispatch_normal");
    if (cells.length < 4) continue;

    const topicCell = cells.eq(2);
    const anchor = topicCell.find("a[href]").first();

    // Prefer the link text; fall back to the whole cell when there is no link text.
    const title = squash(anchor.text() || topicCell.text());
    if (!title) continue;

    // Some rows carry a pdf/file icon, so both link targets and image sources are kept.
    const attachments = [];
    for (const el of topicCell.find("a[href], img[src]").toArray()) {
      const tag = el.tagName?.toLowerCase();
      let target;
      if (tag === "a") {
        target = $(el).attr("href");
      } else if (tag === "img") {
        target = $(el).attr("src");
      }
      if (target) attachments.push(absUrl(target));
    }

    items.push({
      date: squash(cells.eq(0).text()) || null,
      no: squash(cells.eq(1).text()) || null,
      title,
      type: squash(cells.eq(3).text()) || null,
      link: absUrl(anchor.attr("href")) || null,
      fileLinks: [...new Set(attachments)].filter(Boolean),
      sourcePage: page,
      sourceUrl: url,
    });
  }

  const output = {
    source: url,
    scrapedAt: new Date().toISOString(),
    page,
    count: items.length,
    items,
  };
  const pageFile = path.join(OUT, `page-${page}.json`);
  fs.writeFileSync(pageFile, JSON.stringify(output, null, 2), "utf8");
  console.log(`✅ page ${page} -> items ${items.length}`);

  return items;
}
|
|
|
|
/**
 * Entry point: scrape every listing page, de-duplicate the items and write the
 * combined result to `all.json` inside `OUT`.
 *
 * A page that throws (for example curl exiting with a non-zero status) is logged and
 * skipped so that one bad request late in the run does not discard everything already
 * collected. Failed page numbers are reported at the end and the process exit code is
 * set to 1. If the very first page fails or yields no items the run stops immediately,
 * because that points at a wrong URL or selector rather than a transient error.
 */
(function main() {
  const totalPages = 1231; // taken from the site's pagination (it goes up to 1231)
  const all = [];
  const seen = new Set();
  const failedPages = [];

  for (let p = 1; p <= totalPages; p++) {
    let items;
    try {
      items = scrapeOnePage(p, p === 1); // keep the raw HTML of the first page for debugging
    } catch (err) {
      failedPages.push(p);
      console.error(`❌ page ${p} failed: ${err.message}`);
      if (p === 1) break; // nothing works yet; continuing would only repeat the failure
      continue;
    }

    // Safety net: an empty first page means the URL/selector is wrong, so stop and debug.
    if (p === 1 && items.length === 0) {
      console.log("❌ page 1 = 0: เปิด debug-page-1.html แล้วเช็ค buildUrl(menuId/path)");
      break;
    }

    for (const it of items) {
      const key = `${it.date}|${it.no}|${it.title}|${it.type}|${it.link}`;
      if (seen.has(key)) continue;
      seen.add(key);
      all.push(it);
    }
  }

  fs.writeFileSync(
    path.join(OUT, `all.json`),
    JSON.stringify(
      { scrapedAt: new Date().toISOString(), totalItems: all.length, items: all },
      null,
      2
    ),
    "utf8"
  );

  console.log("✅ Total:", all.length);

  if (failedPages.length > 0) {
    console.error(`❌ failed pages (${failedPages.length}): ${failedPages.join(", ")}`);
    process.exitCode = 1;
  }
})();
|