// Announcement results ("ผลประกาศ", menu 1238)
// - list: read each row's detailUrl from a.listdataconfig_link
// - detail: read the file link from a.uploadconfig_link (data-href/href)
// - resolve: call /status/1/ to get the real path, then build the full URL
|
|
|
|
const { execFileSync, execSync } = require("child_process");
const fs = require("fs");
const path = require("path");

const axios = require("axios").default;
const cheerio = require("cheerio");
|
|
|
|
// Origin of the site being scraped; site-relative links are resolved against it.
const BASE = "https://ladsawai.go.th";

// Output directory "ผลประกาศ", located under the current working directory.
const OUT = path.resolve(process.cwd(), "ผลประกาศ");

// Make sure the output directory exists before anything tries to write into it.
fs.mkdirSync(OUT, { recursive: true });
|
|
|
|
/**
 * Downloads a URL with the system `curl` binary and returns the body as text.
 *
 * curl is started directly with an argument array (no shell), so quotes, `$`,
 * backticks or other shell metacharacters inside a scraped URL are passed to
 * curl verbatim instead of being interpreted as shell syntax.
 *
 * @param {string} url - URL to fetch.
 * @returns {string} Response body decoded as UTF-8.
 * @throws {Error} Whatever `execFileSync` throws, e.g. when curl is missing or
 *   exits with a non-zero status.
 */
function curlHtml(url) {
  const args = [
    "-L", // follow redirects
    "-s", // no progress meter
    url,
    "-H",
    "User-Agent: Mozilla/5.0",
    "-H",
    "Accept-Language: th-TH,th;q=0.9",
  ];

  return execFileSync("curl", args, {
    encoding: "utf8",
    maxBuffer: 30 * 1024 * 1024, // allow responses of up to 30 MiB
  });
}
|
|
|
|
/**
 * Converts a link found in a scraped page into an absolute URL.
 *
 * - `http://` / `https://` links (any letter case) are returned unchanged.
 * - Protocol-relative links (`//host/path`) get the scheme of BASE.
 * - Everything else is treated as a path on BASE, with or without a leading "/".
 *
 * @param {string|null|undefined} href - Raw attribute value.
 * @returns {string|null} Absolute URL, or null when href is missing or blank.
 */
function absUrl(href) {
  if (!href) return null;

  const h = href.trim();
  if (!h) return null;

  // Require the full "scheme://" prefix so that a relative path which merely
  // starts with the letters "http" (e.g. "httpdocs/a.pdf") is not mistaken
  // for an absolute URL.
  if (/^https?:\/\//i.test(h)) return h;

  // "//host/path" points at another host; only the scheme is inherited.
  if (h.startsWith("//")) return `${new URL(BASE).protocol}${h}`;

  if (h.startsWith("/")) return `${BASE}${h}`;

  // Relative path without a leading slash, e.g. "public/...".
  return `${BASE}/${h}`;
}
|
|
|
|
/**
 * Builds the URL of one page of a menu's list view.
 *
 * @param {number|string} menuId - Menu id placed after `/menu/`.
 * @param {number|string} page - Page number placed after `/page/`.
 * @returns {string} Absolute list-page URL on BASE.
 */
function buildListUrl(menuId, page) {
  const listRoot = `${BASE}/public/list/data/index`;
  return `${listRoot}/menu/${menuId}/page/${page}`;
}
|
|
|
|
/**
 * Guesses the number of list pages from a parsed document: the largest value
 * among all anchors whose trimmed text consists only of digits.
 *
 * @param {Function} $ - Cheerio handle of the parsed page.
 * @returns {number} Highest numeric anchor label, or 1 when there is none.
 */
function detectTotalPages($) {
  return $("a")
    .toArray()
    .map((anchor) => $(anchor).text().trim())
    .filter((label) => /^\d+$/.test(label))
    .reduce((highest, label) => Math.max(highest, Number(label)), 1);
}
|
|
|
|
/**
 * Asks the site's `/status/1/` endpoint for the real storage path of a file
 * link and turns that path into a full URL under `BASE/public/`.
 *
 * This is best-effort: any failure is logged as a warning and reported to the
 * caller as null, never thrown.
 *
 * @param {string} fileUrl - Absolute file link taken from a detail page.
 * @returns {Promise<string|null>} Full URL of the real file, or null when the
 *   request fails or the response carries no usable `path`.
 */
async function resolveRealFilePath(fileUrl) {
  try {
    // Drop one trailing slash so the result does not contain "//status/1/".
    const statusUrl = `${fileUrl.replace(/\/$/, "")}/status/1/`;
    const res = await axios.get(statusUrl, { timeout: 30000 });

    const p = res?.data?.path;
    if (typeof p !== "string") return null;

    // The path is appended after "/public/", so its own leading slashes go.
    const relativePath = p.replace(/^\/+/, "");
    if (!relativePath) return null;

    return `${BASE}/public/${relativePath}`;
  } catch (err) {
    // Keep the failure visible instead of swallowing it silently.
    console.warn(`⚠️ could not resolve real file path for ${fileUrl}: ${err?.message ?? err}`);
    return null;
  }
}
|
|
|
|
/**
 * Finds the attachment link on a detail page and resolves it to the real file.
 *
 * Lookup order:
 *   1. the first `a.uploadconfig_link` that has a non-empty `data-href`/`href`;
 *   2. otherwise the first anchor whose absolute URL ends in a document or
 *      archive extension (optionally followed by a query string or fragment).
 *
 * A detail page that cannot be downloaded is logged and reported as
 * "no file" so that one bad page does not abort the whole scrape.
 *
 * @param {string} detailUrl - Absolute URL of the detail page.
 * @returns {Promise<{rawUrl: (string|null), realUrl: (string|null)}>}
 *   `rawUrl` is the link as published on the page (made absolute); `realUrl`
 *   is what `resolveRealFilePath` returns for it, or null.
 */
async function scrapeDetailFile(detailUrl) {
  let html;
  try {
    html = curlHtml(detailUrl);
  } catch (err) {
    console.warn(`⚠️ could not fetch detail page ${detailUrl}: ${err?.message ?? err}`);
    return { rawUrl: null, realUrl: null };
  }

  const $ = cheerio.load(html);

  // `data-href` wins over `href` when both are present.
  const linkTarget = (a) => ($(a).attr("data-href") || $(a).attr("href") || "").trim();

  // 1) Preferred source: the dedicated upload links.
  let raw = "";
  for (const a of $("a.uploadconfig_link").toArray()) {
    raw = linkTarget(a);
    if (raw) break;
  }

  // 2) Fallback: any anchor that looks like a document/archive download.
  if (!raw) {
    const documentLink = /\.(pdf|doc|docx|xls|xlsx|ppt|pptx|zip|rar)([?#]|$)/i;
    for (const a of $("a[href], a[data-href]").toArray()) {
      const h = linkTarget(a);
      const full = absUrl(h);
      if (full && documentLink.test(full)) {
        raw = h;
        break; // the first match is enough
      }
    }
  }

  const rawUrl = absUrl(raw);
  const realUrl = rawUrl ? await resolveRealFilePath(rawUrl) : null;

  return { rawUrl, realUrl };
}
|
|
|
|
/**
 * Scrapes one list page of a menu, follows each row's detail link to look up
 * its attachment, and writes the result to
 * `list-menu-<menuId>-page-<page>.json` inside OUT.
 *
 * @param {number|string} menuId - Menu id used to build the list URL.
 * @param {number|string} page - Page number used to build the list URL.
 * @param {boolean} [saveHtml=false] - When true, the raw list HTML is also
 *   written to `debug-menu-<menuId>-page-<page>.html` inside OUT.
 * @returns {Promise<{$: Function, items: Array<object>}>} The cheerio handle
 *   of the list page and the items scraped from it.
 */
async function scrapeOnePage(menuId, page, saveHtml = false) {
  // Collapses every whitespace run to a single space and trims the ends.
  const squash = (text) => text.replace(/\s+/g, " ").trim();

  const url = buildListUrl(menuId, page);
  const html = curlHtml(url);

  if (saveHtml) {
    const debugFile = path.join(OUT, `debug-menu-${menuId}-page-${page}.html`);
    fs.writeFileSync(debugFile, html, "utf8");
  }

  const $ = cheerio.load(html);
  const items = [];

  // Iterate a plain array with for...of (not cheerio's .each) so that every
  // detail lookup is awaited before moving on to the next row.
  for (const row of $(".row.data-row").toArray()) {
    const el = $(row);
    const left = el.find(".col-12.col-sm-10").first();
    const a = left.find("a.listdataconfig_link[href]").first();

    // Prefer the <label> text; fall back to the anchor's whole text.
    const title = squash(a.find("label").text()) || squash(a.text());
    if (!title) continue; // rows without a title are skipped

    const detailUrl = absUrl(a.attr("href"));
    const date = squash(el.find(".col-12.col-sm-2 #show-right-date").text());

    const icons = left
      .find("img")
      .toArray()
      .map((img) => $(img).attr("src"))
      .filter(Boolean)
      .map((src) => absUrl(src));

    // Visit the detail page to obtain the actual file links.
    const file = detailUrl
      ? await scrapeDetailFile(detailUrl)
      : { rawUrl: null, realUrl: null };

    items.push({
      title,
      date: date || null,
      detailUrl: detailUrl || null,
      fileRawUrl: file.rawUrl,
      fileRealUrl: file.realUrl, // resolved location of the file
      icons,
      sourcePage: page,
      sourceUrl: url,
    });
  }

  const output = {
    source: url,
    scrapedAt: new Date().toISOString(),
    menuId,
    page,
    count: items.length,
    items,
  };

  const pageFile = path.join(OUT, `list-menu-${menuId}-page-${page}.json`);
  fs.writeFileSync(pageFile, JSON.stringify(output, null, 2), "utf8");

  console.log(`✅ page ${page} -> items ${items.length}`);
  return { $, items };
}
|
|
|
|
/**
 * Entry point: scrapes every list page of menu 1238, de-duplicates the items
 * and writes the merged result to `list-menu-1238-all.json` inside OUT.
 *
 * A failure anywhere in the run is reported on stderr and sets a non-zero
 * exit code instead of being left as an unhandled promise rejection.
 */
(async function main() {
  const menuId = 1238;

  // Page 1 is scraped first (with its HTML dumped for debugging); the total
  // page count is derived from its parsed document.
  const first = await scrapeOnePage(menuId, 1, true);
  const totalPages = detectTotalPages(first.$);
  console.log("✅ totalPages =", totalPages);

  const all = [];
  const seen = new Set();

  // Appends only items not seen before; an item is identified by its
  // title, date and detail URL together.
  const addItems = (items) => {
    for (const it of items) {
      const key = `${it.title || ""}|${it.date || ""}|${it.detailUrl || ""}`;
      if (seen.has(key)) continue;
      seen.add(key);
      all.push(it);
    }
  };

  addItems(first.items);

  // Remaining pages are fetched one after another.
  for (let p = 2; p <= totalPages; p++) {
    const { items } = await scrapeOnePage(menuId, p, false);
    addItems(items);
  }

  const merged = {
    menuId,
    totalPages,
    scrapedAt: new Date().toISOString(),
    totalItems: all.length,
    items: all,
  };

  const outAll = path.join(OUT, `list-menu-${menuId}-all.json`);
  fs.writeFileSync(outAll, JSON.stringify(merged, null, 2), "utf8");

  console.log("🎉 Saved merged JSON:", outAll);
  console.log("🎉 Total unique items:", all.length);
})().catch((err) => {
  console.error("❌ scrape failed:", err);
  process.exitCode = 1;
});
|