// Scraper for the "รายงานติดตามและประเมินผลแผนพัฒนา" listing
// (reports on monitoring and evaluation of the development plan).
const { execFileSync, execSync } = require("child_process");
const fs = require("fs");
const path = require("path");

const axios = require("axios").default;
const cheerio = require("cheerio");
// Origin of the site being scraped; relative links are resolved against it.
const BASE = "https://ladsawai.go.th";

// Output directory, created under the current working directory. The folder
// name is Thai for "reports on monitoring and evaluation of the development plan".
const OUT = path.join(process.cwd(), "รายงานติดตามและประเมินผลแผนพัฒนา");

// Make sure the output directory exists before anything is written to it
// (`recursive: true` also makes this a no-op when it already exists).
fs.mkdirSync(OUT, { recursive: true });
/**
 * Downloads a page with the system `curl` binary and returns the body as text.
 *
 * The call is synchronous: it blocks the event loop until curl exits.
 *
 * @param {string} url - Address to fetch (redirects are followed).
 * @returns {string} The response body decoded as UTF-8.
 * @throws {Error} When curl cannot be spawned, exits with a non-zero status,
 *   or produces more than 30 MiB of output.
 */
function curlHtml(url) {
  // The URL often comes from scraped `href` attributes, so it must never be
  // parsed by a shell. Passing an argument array to execFileSync avoids both
  // command injection and quoting problems (`"`, `$`, backticks, `&` ...).
  const args = [
    "-L",
    "-s",
    "-H",
    "User-Agent: Mozilla/5.0",
    "-H",
    "Accept-Language: th-TH,th;q=0.9",
    // `--url` makes curl treat the next argument as the URL even if it
    // happens to start with a dash.
    "--url",
    String(url),
  ];

  return execFileSync("curl", args, {
    encoding: "utf8",
    maxBuffer: 30 * 1024 * 1024,
  });
}
/**
 * Turns an `href` value found on the site into an absolute URL.
 *
 * - `http://` / `https://` URLs are returned unchanged (after trimming).
 * - Protocol-relative URLs (`//host/path`) get the scheme of `BASE`.
 * - Root-relative (`/x`) and plain relative (`x`) paths are appended to `BASE`.
 * - Anything with another scheme (`javascript:`, `mailto:`, `tel:` ...) is not
 *   fetchable and yields `null`.
 *
 * @param {string|null|undefined} href - Raw attribute value.
 * @returns {string|null} Absolute URL, or `null` when there is nothing usable.
 */
function absUrl(href) {
  if (typeof href !== "string") return null;

  const link = href.trim();
  if (link === "") return null;

  // Require the full "http(s)://" prefix: a bare startsWith("http") would
  // also match relative paths such as "httpdocs/file.pdf".
  if (/^https?:\/\//i.test(link)) return link;

  if (link.startsWith("//")) return `${new URL(BASE).protocol}${link}`;

  // Remaining "scheme:" links (javascript:, mailto:, data: ...) cannot be
  // downloaded, so do not turn them into bogus site URLs.
  if (/^[a-z][a-z0-9+.-]*:/i.test(link)) return null;

  return link.startsWith("/") ? `${BASE}${link}` : `${BASE}/${link}`;
}
/**
 * Builds the URL of one page of a category listing.
 *
 * An earlier version of this script used the un-categorised listing at
 * `${BASE}/public/list/data/index/menu/<menuId>/page/<page>` instead.
 *
 * @param {number|string} menuId - Value for the `menu` path segment.
 * @param {number|string} catid - Value for the `catid` path segment.
 * @param {number|string} page - Value for the `page` path segment.
 * @returns {string} Absolute listing URL.
 */
function buildUrl(menuId, catid, page) {
  const segments = [
    "public/list/data/datacategory",
    `catid/${catid}`,
    `menu/${menuId}`,
    `page/${page}`,
  ];
  return `${BASE}/${segments.join("/")}`;
}
/**
 * Estimates how many listing pages exist from an already-loaded document.
 *
 * Every `<a>` whose trimmed text consists only of digits is treated as a
 * pager link; the largest such number wins.
 *
 * NOTE(review): any numeric link on the page counts (not only the pager), so
 * the result is an upper-bound heuristic. Confirm against the real markup
 * before narrowing the selector.
 *
 * @param {Function} $ - Cheerio-style query function for the listing page.
 * @returns {number} Highest page number found, never less than 1.
 */
function detectTotalPages($) {
  let maxPage = 1;

  $("a").each((_, anchor) => {
    const label = $(anchor).text().trim();
    if (!/^\d+$/.test(label)) return;
    maxPage = Math.max(maxPage, Number(label));
  });

  return maxPage;
}
/**
 * Fetches a detail page and collects the downloadable files linked from it.
 *
 * Two sources are scanned, in this order:
 *   1. `a.uploadconfig_link` anchors, preferring `data-href` over `href`;
 *   2. any `a[href]` whose URL ends in a known document/archive extension.
 *
 * Each URL is reported once; the first occurrence (and its text) is kept, so
 * element 0 is always the first upload link when the page has one.
 *
 * @param {string} detailUrl - Absolute URL of the detail page.
 * @returns {{text: (string|null), url: string}[]} Files in discovery order;
 *   `text` is the whitespace-collapsed link text, or `null` when empty.
 * @throws {Error} Whatever `curlHtml` throws when the page cannot be fetched.
 */
function extractFileLinksFromDetail(detailUrl) {
  const $ = cheerio.load(curlHtml(detailUrl));

  // Extension followed by end of string, a query string, or a fragment.
  const directFilePattern = /\.(pdf|doc|docx|xls|xlsx|ppt|pptx|zip|rar)(\?|#|$)/i;

  const files = [];
  const seenUrls = new Set(); // O(1) duplicate check shared by both passes

  const addFile = (anchor, url) => {
    if (!url || seenUrls.has(url)) return;
    seenUrls.add(url);
    files.push({
      text: anchor.text().replace(/\s+/g, " ").trim() || null,
      url,
    });
  };

  // Pass 1: the site's dedicated upload links.
  $("a.uploadconfig_link").each((_, node) => {
    const anchor = $(node);
    addFile(anchor, absUrl(anchor.attr("data-href") || anchor.attr("href")));
  });

  // Pass 2 (fallback): plain links that point straight at a file.
  $("a[href]").each((_, node) => {
    const anchor = $(node);
    const url = absUrl(anchor.attr("href"));
    if (url && directFilePattern.test(url)) addFile(anchor, url);
  });

  return files;
}

// ✅ Call the file's /status/1/ API endpoint to get its real path
/**
 * Asks the site's `<fileUrl>/status/1/` endpoint for the stored path of a file.
 *
 * This is best-effort: any failure (network error, timeout, unexpected
 * payload) is reported with `console.warn` and turned into `null` so that one
 * bad file never aborts the scrape.
 *
 * @param {string} fileUrl - Absolute URL of the file as linked on the site.
 * @returns {Promise<string|null>} The `path` field of the JSON response, or
 *   `null` when it is missing or the request failed.
 */
async function resolveRealFilePath(fileUrl) {
  let statusUrl = null;
  try {
    // Strip every trailing slash first so the result never contains "//status".
    statusUrl = `${fileUrl.replace(/\/+$/, "")}/status/1/`;
    const res = await axios.get(statusUrl, { timeout: 30000 });
    return res?.data?.path || null;
  } catch (err) {
    console.warn(
      `⚠️ could not resolve real file path (${statusUrl ?? fileUrl}): ${err?.message ?? err}`
    );
    return null;
  }
}

// ✅ Simple concurrency limiter (so the server is not hit too hard)
/**
 * Maps `arr` through an async `mapper`, running at most `limit` calls at once.
 *
 * Results keep the order of `arr` regardless of completion order. A `limit`
 * below 1 (or `NaN`) is treated as 1, so every element is always processed.
 *
 * If `mapper` rejects, the returned promise rejects with that error; calls
 * already started by the other workers are not cancelled.
 *
 * @param {Array} arr - Items to process.
 * @param {number} limit - Maximum number of concurrent `mapper` calls.
 * @param {(item: any, index: number) => any} mapper - Sync or async callback.
 * @returns {Promise<Array>} Mapper results, index-aligned with `arr`.
 */
async function mapLimit(arr, limit, mapper) {
  const results = [];
  let nextIndex = 0;

  // Each worker keeps pulling the next unclaimed index until none are left.
  // `nextIndex++` runs synchronously, so two workers never claim the same item.
  const worker = async () => {
    while (nextIndex < arr.length) {
      const idx = nextIndex++;
      results[idx] = await mapper(arr[idx], idx);
    }
  };

  // Without this clamp a limit of 0/NaN/negative would start no workers and
  // silently return an empty result.
  const maxWorkers = limit >= 1 ? Math.floor(limit) : 1;
  const workerCount = Math.min(maxWorkers, arr.length);

  await Promise.all(Array.from({ length: workerCount }, () => worker()));
  return results;
}
/**
 * Scrapes one listing page, follows each row to its detail page, and writes
 * the result to `menu-<menuId>-catid-<catid>-page-<page>.json` inside `OUT`.
 *
 * For every `.row.data-row` that has a titled `a.listdataconfig_link`, the
 * detail page is fetched, its first file link is taken, and the file's real
 * path is resolved. Failures on a single row are logged and leave that row's
 * `fileUrl` / `filePath` as `null`; the row itself is still reported.
 *
 * Note: detail pages are fetched with the synchronous `curlHtml`, so only the
 * real-path lookups actually overlap under the concurrency limit of 5.
 *
 * @param {number|string} menuId - Menu id of the listing.
 * @param {number|string} catid - Category id of the listing.
 * @param {number} page - 1-based page number.
 * @param {boolean} [saveHtml=false] - Also dump the raw listing HTML to
 *   `debug-menu-<menuId>-catid-<catid>-page-<page>.html` for debugging.
 * @returns {Promise<{$: Function, items: Object[]}>} The loaded document
 *   (used by the caller for pager detection) and the scraped items.
 * @throws {Error} When the listing page cannot be fetched or the output
 *   files cannot be written.
 */
async function scrapeOnePage(menuId, catid, page, saveHtml = false) {
  const url = buildUrl(menuId, catid, page);
  const html = curlHtml(url);

  if (saveHtml) {
    const debugFile = path.join(OUT, `debug-menu-${menuId}-catid-${catid}-page-${page}.html`);
    fs.writeFileSync(debugFile, html, "utf8");
  }

  const $ = cheerio.load(html);

  // Materialise the rows as a plain array so they can go through mapLimit.
  const rows = $(".row.data-row").toArray();

  const scraped = await mapLimit(rows, 5, async (row) => {
    const link = $(row).find("a.listdataconfig_link[href]").first();
    if (!link.length) return null;

    const title =
      link.find("label.font-weight").text().replace(/\s+/g, " ").trim() ||
      link.text().replace(/\s+/g, " ").trim();
    if (!title) return null;

    const detailUrl = absUrl(link.attr("href"));
    let fileUrl = null;
    let realPath = null;

    try {
      const files = detailUrl ? extractFileLinksFromDetail(detailUrl) : [];
      // URLs returned by extractFileLinksFromDetail are already absolute.
      fileUrl = files[0]?.url ?? null;
      if (fileUrl) realPath = await resolveRealFilePath(fileUrl);
    } catch (err) {
      // Best effort: keep the row, drop its file info, and say why.
      console.warn(`⚠️ could not read detail page ${detailUrl}: ${err?.message ?? err}`);
      fileUrl = null;
      realPath = null;
    }

    return {
      title,
      detailUrl: detailUrl || null,
      // File link as found on the detail page.
      fileUrl,
      // Real location reported by the /status/1/ API; null when unknown
      // (instead of a bogus ".../public/null" URL).
      filePath: realPath ? `${BASE}/public/${realPath}` : null,
      sourcePage: page,
      sourceUrl: url,
    };
  });

  // Rows without a usable link/title were mapped to null; drop them.
  const items = scraped.filter(Boolean);

  const output = {
    source: url,
    scrapedAt: new Date().toISOString(),
    menuId,
    catid,
    page,
    count: items.length,
    items,
  };

  fs.writeFileSync(
    path.join(OUT, `menu-${menuId}-catid-${catid}-page-${page}.json`),
    JSON.stringify(output, null, 2),
    "utf8"
  );

  console.log(`✅ page ${page} -> items ${items.length}`);
  return { $, items };
}
/**
 * Entry point: scrapes every page of the configured listing, de-duplicates
 * the items, and writes them to `menu-<menuId>-catid-<catid>-all.json` in `OUT`.
 *
 * Page 1 is scraped first (with its HTML saved for debugging) because the
 * pager on that page determines how many further pages to request. Pages are
 * then fetched one after another.
 */
(async function main() {
  const menuId = 1196;
  const catid = 7;

  const first = await scrapeOnePage(menuId, catid, 1, true);
  const totalPages = detectTotalPages(first.$);
  console.log("✅ totalPages =", totalPages);

  const all = [];
  const seen = new Set();

  // Appends items not seen before; identity is title + detail URL + file path.
  const addItems = (items) => {
    for (const item of items) {
      const key = `${item.title}|${item.detailUrl || ""}|${item.filePath || ""}`;
      if (seen.has(key)) continue;
      seen.add(key);
      all.push(item);
    }
  };

  addItems(first.items);

  for (let p = 2; p <= totalPages; p++) {
    const { items } = await scrapeOnePage(menuId, catid, p, false);
    addItems(items);
  }

  const merged = {
    menuId,
    catid,
    totalPages,
    scrapedAt: new Date().toISOString(),
    totalItems: all.length,
    items: all,
  };

  const outAll = path.join(OUT, `menu-${menuId}-catid-${catid}-all.json`);
  fs.writeFileSync(outAll, JSON.stringify(merged, null, 2), "utf8");

  console.log("🎉 Saved all:", outAll);
  console.log("🎉 Total unique:", all.length);
})().catch((err) => {
  // Without a handler a failure would surface only as an unhandled rejection;
  // report it and make the process exit with a failing status.
  console.error("❌ scrape failed:", err);
  process.exitCode = 1;
});