// migrate-lsw/municipal-ordinances-orders.js
// 2026-01-13 10:17:00 +07:00
//
// 222 lines
// 5.9 KiB
// JavaScript

// Municipal ordinances and municipal orders (เทศบัญญัติและคำสั่งเทศบาล)
const { execFileSync, execSync } = require("child_process");
const fs = require("fs");
const path = require("path");
const axios = require("axios").default;
const cheerio = require("cheerio");
// Origin of the site being scraped; site-relative links are prefixed with it.
const BASE = "https://ladsawai.go.th";
// Output directory, created under the current working directory. The Thai
// folder name means "municipal ordinances and municipal orders".
const OUT = path.join(process.cwd(), "เทศบัญญัติและคำสั่งเทศบาล");
// Make sure the output directory exists as soon as the script loads;
// `recursive: true` keeps this from failing when it is already there.
fs.mkdirSync(OUT, { recursive: true });
/**
 * Download a URL with the system `curl` binary and return the response body.
 *
 * The URL is handed to curl as a single argv entry (no shell is involved), so
 * quotes, `$`, backticks or `;` inside a scraped link cannot be interpreted as
 * shell syntax.
 *
 * @param {string} url - Absolute URL to fetch.
 * @returns {string} Response body decoded as UTF-8.
 * @throws {Error} When curl cannot be started, exits with a non-zero status,
 *   or produces more output than the 30 MiB buffer allows.
 */
function curlHtml(url) {
  const curlArgs = [
    "-L", // follow redirects
    "-s", // silent: no progress meter
    "--url",
    url,
    "-H",
    "User-Agent: Mozilla/5.0",
    "-H",
    "Accept-Language: th-TH,th;q=0.9",
  ];
  return execFileSync("curl", curlArgs, {
    encoding: "utf8",
    maxBuffer: 30 * 1024 * 1024,
  });
}
/**
 * Turn a link taken from a scraped page into an absolute URL.
 *
 * - Missing, non-string or blank values yield `null`.
 * - `http://` / `https://` URLs (any letter case) are returned unchanged
 *   apart from trimming surrounding whitespace.
 * - Protocol-relative links (`//host/path`) get the `https:` scheme instead
 *   of being glued onto BASE.
 * - Everything else is treated as a path on BASE, with exactly one `/`
 *   inserted when the link does not start with one.
 *
 * @param {string|null|undefined} href - Raw attribute value.
 * @returns {string|null} Absolute URL, or `null` when there is nothing to resolve.
 */
function absUrl(href) {
  if (typeof href !== "string") return null;
  const link = href.trim();
  if (link === "") return null;
  // Require the full scheme separator so relative paths that merely begin
  // with "http" (e.g. "httpdocs/a.pdf") are not mistaken for absolute URLs.
  if (/^https?:\/\//i.test(link)) return link;
  if (link.startsWith("//")) return `https:${link}`;
  if (link.startsWith("/")) return BASE + link;
  return `${BASE}/${link}`;
}
/**
 * Compose the listing URL for one page of a category under a menu.
 *
 * Note: the source previously kept a commented-out alternative that used the
 * un-categorised route `/public/list/data/index/menu/<menuId>/page/<page>`.
 *
 * @param {number|string} menuId - Menu identifier placed after `/menu/`.
 * @param {number|string} catid - Category identifier placed after `/catid/`.
 * @param {number|string} page - Page number placed after `/page/`.
 * @returns {string} Absolute listing URL on BASE.
 */
function buildUrl(menuId, catid, page) {
  const listingRoot = `${BASE}/public/list/data/datacategory`;
  const selector = `catid/${catid}/menu/${menuId}/page/${page}`;
  return `${listingRoot}/${selector}`;
}
/**
 * Estimate how many listing pages exist from the anchors on a loaded page.
 *
 * Every `<a>` whose trimmed text consists only of digits is treated as a
 * page-number link and the largest such number wins. Because all anchors are
 * inspected (not just a pagination widget), any other all-digit link text on
 * the page also counts.
 *
 * @param {Function} $ - Cheerio-style query function for the loaded document.
 * @returns {number} Highest numeric anchor label found, or 1 when there is none.
 */
function detectTotalPages($) {
  let highest = 1;
  $("a").each((_, anchor) => {
    const label = $(anchor).text().trim();
    if (!/^\d+$/.test(label)) return;
    const pageNo = Number(label);
    if (pageNo > highest) highest = pageNo;
  });
  return highest;
}
/**
 * Fetch a detail page and list the downloadable files it links to.
 *
 * Two passes are made over the document:
 *  1. anchors with class `uploadconfig_link`, preferring `data-href` over
 *     `href`;
 *  2. as a fallback, any anchor whose resolved URL ends in a known document
 *     or archive extension (optionally followed by a query string), skipping
 *     URLs that are already listed.
 *
 * @param {string} detailUrl - Absolute URL of the detail page.
 * @returns {Array<{text: (string|null), url: string}>} Links in discovery order;
 *   `text` is the anchor label, or `null` when it is empty.
 */
function extractFileLinksFromDetail(detailUrl) {
  const $ = cheerio.load(curlHtml(detailUrl));
  const collected = [];

  // Pass 1: anchors explicitly marked as uploads.
  $("a.uploadconfig_link").each((_, node) => {
    const anchor = $(node);
    const plainHref = anchor.attr("href");
    const dataHref = anchor.attr("data-href");
    const resolved = absUrl(dataHref || plainHref);
    if (resolved) {
      const label = anchor.text().replace(/\s+/g, " ").trim();
      collected.push({ text: label || null, url: resolved });
    }
  });

  // Pass 2 (fallback): direct links to files, recognised by extension.
  const fileExtPattern = /\.(pdf|doc|docx|xls|xlsx|ppt|pptx|zip|rar)(\?|$)/i;
  $("a[href]").each((_, node) => {
    const resolved = absUrl($(node).attr("href"));
    if (!resolved || !fileExtPattern.test(resolved)) return;
    const alreadyListed = collected.some((entry) => entry.url === resolved);
    if (alreadyListed) return;
    collected.push({ text: $(node).text().trim() || null, url: resolved });
  });

  return collected;
}
/**
 * Ask the site's status endpoint for the real path of an uploaded file.
 *
 * Sends a GET to `<fileUrl>/status/1/` and returns the `path` field of the
 * response body. This is best-effort: any failure is logged as a warning and
 * reported as `null` rather than thrown, so one bad file does not abort the
 * whole scrape.
 *
 * @param {string} fileUrl - Absolute URL of the file entry.
 * @returns {Promise<string|null>} The `path` value, or `null` when it is
 *   missing/empty or the request failed.
 */
async function resolveRealFilePath(fileUrl) {
  try {
    // Drop a single trailing slash so the suffix never yields "//status/1/".
    const statusUrl = `${fileUrl.replace(/\/$/, "")}/status/1/`;
    const res = await axios.get(statusUrl, { timeout: 30000 });
    return res?.data?.path || null;
  } catch (err) {
    console.warn(
      `resolveRealFilePath: status lookup failed for ${fileUrl}: ${err?.message ?? err}`
    );
    return null;
  }
}
/**
 * Map an array through an async function with bounded concurrency.
 *
 * Up to `limit` workers pull the next unclaimed index from a shared counter,
 * so at most `limit` mapper calls are in flight at once. Results are stored
 * at the index of their input, so the output order matches the input order
 * regardless of completion order.
 *
 * A `limit` below 1 (or one that is not a number) is treated as 1, so every
 * element is still processed instead of an empty result being returned.
 *
 * @param {Array} arr - Items to process.
 * @param {number} limit - Maximum number of concurrent mapper calls.
 * @param {function(*, number): (Promise<*>|*)} mapper - Called as `mapper(item, index)`.
 * @returns {Promise<Array>} Mapped values, index-aligned with `arr`.
 *   Rejects with the first mapper error.
 */
async function mapLimit(arr, limit, mapper) {
  const results = [];
  let nextIndex = 0;

  const worker = async () => {
    // The claim (`nextIndex++`) happens synchronously before the await, so two
    // workers can never take the same index.
    while (nextIndex < arr.length) {
      const idx = nextIndex++;
      results[idx] = await mapper(arr[idx], idx);
    }
  };

  const width = limit >= 1 ? Math.floor(limit) : 1;
  const pool = Array.from({ length: Math.min(width, arr.length) }, () => worker());
  await Promise.all(pool);
  return results;
}
/**
 * Scrape one listing page, enrich every row with its first attached file,
 * and write the result to `menu-<menuId>-catid-<catid>-page-<page>.json`
 * inside OUT.
 *
 * For each `.row.data-row` element the title and detail link are read; the
 * detail page is then fetched to find file links, and the first file's real
 * path is looked up through the status endpoint. A failure while enriching a
 * row is logged and leaves that row's `fileUrl` / `filePath` as `null`; the
 * row itself is still kept.
 *
 * @param {number|string} menuId - Menu identifier used in the listing URL.
 * @param {number|string} catid - Category identifier used in the listing URL.
 * @param {number} page - Page number to scrape.
 * @param {boolean} [saveHtml=false] - Also dump the raw listing HTML to a
 *   `debug-…html` file in OUT.
 * @returns {Promise<{$: Function, items: Array<Object>}>} The loaded document
 *   (so the caller can inspect pagination) and the scraped items.
 */
async function scrapeOnePage(menuId, catid, page, saveHtml = false) {
  const url = buildUrl(menuId, catid, page);
  const html = curlHtml(url);
  if (saveHtml) {
    fs.writeFileSync(
      path.join(OUT, `debug-menu-${menuId}-catid-${catid}-page-${page}.html`),
      html,
      "utf8"
    );
  }

  const $ = cheerio.load(html);
  // Materialise the rows as a plain array so they can go through mapLimit.
  const rows = $(".row.data-row").toArray();

  // At most 5 rows are processed at a time. The detail-page fetch itself is a
  // synchronous call, so only the awaited status lookup overlaps between rows.
  const scraped = await mapLimit(rows, 5, async (row) => {
    const link = $(row).find("a.listdataconfig_link[href]").first();
    if (!link.length) return null;

    // Prefer the dedicated title label; fall back to the anchor's whole text.
    const title =
      link.find("label.font-weight").text().replace(/\s+/g, " ").trim() ||
      link.text().replace(/\s+/g, " ").trim();
    if (!title) return null;

    const detailUrl = absUrl(link.attr("href"));
    let fileUrl = null;
    let realPath = null;
    try {
      const files = detailUrl ? extractFileLinksFromDetail(detailUrl) : [];
      fileUrl = files?.[0]?.url ? absUrl(files[0].url) : null;
      if (fileUrl) {
        realPath = await resolveRealFilePath(fileUrl);
      }
    } catch (err) {
      // Best-effort enrichment: keep the row, but record why it has no file.
      console.warn(
        `scrapeOnePage: could not read files for ${detailUrl}: ${err?.message ?? err}`
      );
      fileUrl = null;
      realPath = null;
    }

    return {
      title,
      detailUrl: detailUrl || null,
      fileUrl, // first file link found on the detail page
      // Real location reported by the status endpoint; stays null when the
      // lookup produced nothing, instead of becoming ".../public/null".
      filePath: realPath ? `${BASE}/public/${realPath}` : null,
      sourcePage: page,
      sourceUrl: url,
    };
  });
  const items = scraped.filter(Boolean); // drop rows that were skipped

  const output = {
    source: url,
    scrapedAt: new Date().toISOString(),
    menuId,
    catid,
    page,
    count: items.length,
    items,
  };
  fs.writeFileSync(
    path.join(OUT, `menu-${menuId}-catid-${catid}-page-${page}.json`),
    JSON.stringify(output, null, 2),
    "utf8"
  );
  console.log(`✅ page ${page} -> items ${items.length}`);
  return { $, items };
}
/**
 * Script entry point.
 *
 * Scrapes page 1 (also saving its raw HTML for debugging), reads the page
 * count from that page's numeric links, scrapes the remaining pages one after
 * another, de-duplicates the combined items and writes them to
 * `menu-<menuId>-catid-<catid>-all.json` in OUT.
 *
 * Any error that escapes is logged and turns the process exit code non-zero,
 * instead of being left as an unhandled promise rejection.
 */
(async function main() {
  const menuId = 1539;
  const catid = 66;

  const first = await scrapeOnePage(menuId, catid, 1, true);
  const totalPages = detectTotalPages(first.$);
  console.log("✅ totalPages =", totalPages);

  const all = [];
  const seen = new Set();
  // Two items are the same entry when title, detail URL and file path all match.
  function addItems(items) {
    for (const it of items) {
      const key = `${it.title}|${it.detailUrl || ""}|${it.filePath || ""}`;
      if (seen.has(key)) continue;
      seen.add(key);
      all.push(it);
    }
  }

  addItems(first.items);
  // Pages are fetched sequentially; each call also writes its own per-page JSON.
  for (let p = 2; p <= totalPages; p++) {
    const { items } = await scrapeOnePage(menuId, catid, p, false);
    addItems(items);
  }

  const merged = {
    menuId,
    catid,
    totalPages,
    scrapedAt: new Date().toISOString(),
    totalItems: all.length,
    items: all,
  };
  const outAll = path.join(OUT, `menu-${menuId}-catid-${catid}-all.json`);
  fs.writeFileSync(outAll, JSON.stringify(merged, null, 2), "utf8");
  console.log("🎉 Saved all:", outAll);
  console.log("🎉 Total unique:", all.length);
})().catch((err) => {
  console.error("❌ scrape failed:", err);
  process.exitCode = 1;
});