// Scraper for the "ข่าวประชาสัมพันธ์" (public relations news) listing pages.
const { execFileSync, execSync } = require("child_process");
const fs = require("fs");
const path = require("path");

const axios = require("axios").default;
const cheerio = require("cheerio");

// Origin of the site being scraped; relative links are resolved against it.
const BASE = "https://ladsawai.go.th";

// Output directory for every file this script writes. It lives under the
// current working directory and is created up front (no error if it exists).
const OUT = path.join(process.cwd(), "ข่าวประชาสัมพันธ์");
fs.mkdirSync(OUT, { recursive: true });

/**
 * Downloads a URL with the `curl` binary and returns the response body.
 *
 * The URL is handed to curl as a single argv entry (no shell is involved), so
 * quotes, `$(...)`, backticks or `;` inside a scraped link cannot be
 * interpreted as shell syntax.
 *
 * @param {string} url - Address to fetch; redirects are followed (`-L`).
 * @returns {string} Response body decoded as UTF-8 (up to 30 MiB).
 * @throws {Error} If curl cannot be started or exits with a non-zero status.
 */
function curlHtml(url) {
  const args = [
    "-L",
    "-s",
    // `--url` makes curl treat the next argument as the URL even if it starts with "-".
    "--url",
    url,
    "-H",
    "User-Agent: Mozilla/5.0",
    "-H",
    "Accept-Language: th-TH,th;q=0.9",
  ];

  return execFileSync("curl", args, {
    encoding: "utf8",
    maxBuffer: 30 * 1024 * 1024,
  });
}

/**
 * Turns a link found in the scraped HTML into an absolute URL.
 *
 * @param {string|null|undefined} src - Raw `href` / `src` / `data-href` value.
 * @returns {string|null} Absolute URL, or null when `src` is empty or not a string.
 */
function absUrl(src) {
  if (typeof src !== "string" || src === "") return null;

  // Already absolute. Require the full "http(s)://" prefix so that a relative
  // path which merely starts with "http" (e.g. "httpdocs/a.pdf") is not
  // mistaken for an absolute URL.
  if (/^https?:\/\//i.test(src)) return src;

  // Protocol-relative ("//host/path"): reuse the scheme of BASE.
  if (src.startsWith("//")) return new URL(BASE).protocol + src;

  // Root-relative.
  if (src.startsWith("/")) return BASE + src;

  // Bare relative path, e.g. data-href = "public/....".
  return `${BASE}/${src}`;
}

/**
 * Calls the `<fileUrl>/status/1/` endpoint to obtain the real file path.
 *
 * This is best-effort: any failure (bad input, network error, timeout) is
 * logged as a warning and reported as null instead of being thrown.
 *
 * @param {string} fileUrl - Absolute URL of the attachment link.
 * @returns {Promise<*>} The `path` field of the response body, or null when it
 *   is missing/falsy or the request failed.
 */
async function resolveRealFilePath(fileUrl) {
  try {
    // Drop one trailing slash so the status suffix is not doubled up.
    const statusUrl = `${fileUrl.replace(/\/$/, "")}/status/1/`;
    const res = await axios.get(statusUrl, { timeout: 30000 });
    return res?.data?.path || null;
  } catch (err) {
    console.warn(`⚠️ could not resolve file path for ${fileUrl}: ${err?.message ?? err}`);
    return null;
  }
}

// Links whose target looks like a downloadable document or archive.
const FILE_LINK_RE = /\.(pdf|doc|docx|xls|xlsx|ppt|pptx|zip|rar)(\?|$)/i;

// Links whose target looks like an image.
const IMAGE_LINK_RE = /\.(jpg|jpeg|png|webp|gif)(\?|$)/i;

// Text of a selection with `stripSelector` descendants removed and whitespace
// collapsed. Works on a clone, so the loaded document is left untouched.
function readCleanText($node, stripSelector) {
  return $node
    .clone()
    .find(stripSelector)
    .remove()
    .end()
    .text()
    .replace(/\s+/g, " ")
    .trim();
}

// Collects attachment URLs: dedicated upload links first, and only when none
// exist, any anchor whose target has a document/archive extension.
function collectFileUrls($) {
  const urls = new Set();
  const targetOf = (a) => {
    const $a = $(a);
    return absUrl(($a.attr("data-href") || $a.attr("href") || "").trim());
  };

  $("a.uploadconfig_link").each((_, a) => {
    const full = targetOf(a);
    if (full) urls.add(full);
  });

  if (urls.size === 0) {
    $("a[href], a[data-href]").each((_, a) => {
      const full = targetOf(a);
      if (full && FILE_LINK_RE.test(full)) urls.add(full);
    });
  }

  return urls;
}

// Collects image URLs: gallery links first, and only when none exist, any
// anchor whose target has an image extension.
function collectImageUrls($) {
  const urls = new Set();
  const targetOf = (a) => absUrl(($(a).attr("href") || "").trim());

  $(".maingroup.gallery a[href]").each((_, a) => {
    const full = targetOf(a);
    if (full) urls.add(full);
  });

  if (urls.size === 0) {
    $("a[href]").each((_, a) => {
      const full = targetOf(a);
      if (full && IMAGE_LINK_RE.test(full)) urls.add(full);
    });
  }

  return urls;
}

// Picks the non-gallery ".col-12.maingroup" box with the most text (each <p>
// adds a 50-point bonus) and returns its paragraphs joined by newlines, or the
// box's whole cleaned text when it has no non-empty paragraph.
function extractContent($) {
  let bestBox = null;
  let bestScore = -1;

  $(".col-12.maingroup")
    .not(".gallery")
    .each((_, el) => {
      const $el = $(el);
      const score =
        readCleanText($el, "img, script, style").length + $el.find("p").length * 50;
      if (score > bestScore) {
        bestScore = score;
        bestBox = $el;
      }
    });

  if (!bestBox || !bestBox.length) return "";

  const lines = [];
  bestBox.find("p").each((_, p) => {
    const line = readCleanText($(p), "img");
    if (line) lines.push(line);
  });

  return lines.length
    ? lines.join("\n")
    : readCleanText(bestBox, "img, script, style");
}

/**
 * Downloads a detail page and extracts its images, body text and attachment.
 *
 * Only the first attachment link found is resolved (via
 * `resolveRealFilePath`); the result is reported as a single URL.
 *
 * @param {string} detailUrl - Absolute URL of the detail page.
 * @returns {Promise<{imgs: string[], text: string, files: (string|null)}>}
 *   `imgs` are unique image URLs in document order, `text` is the extracted
 *   content ("" if no content box was found), and `files` is the resolved
 *   attachment URL under `${BASE}/public/` or null.
 * @throws {Error} If the page download fails (see `curlHtml`).
 */
async function scrapeDetailImagesContent(detailUrl) {
  const $ = cheerio.load(curlHtml(detailUrl));

  // A Set iterates in insertion order, so this is the first link found.
  const [firstFileUrl] = collectFileUrls($);

  let realPath = null;
  if (firstFileUrl) {
    const p = await resolveRealFilePath(firstFileUrl);
    realPath = p ? `${BASE}/public/${p}` : null;
  }

  return {
    imgs: [...collectImageUrls($)],
    text: extractContent($),
    files: realPath,
  };
}

/**
 * Scrapes one listing page, follows every row to its detail page, and writes
 * the result to `list-menu-<menuId>-page-<page>.json` inside OUT.
 *
 * Async because each row awaits its detail page.
 *
 * @param {number|string} menuId - Menu identifier used in the listing URL.
 * @param {number} page - Page number used in the listing URL.
 * @param {boolean} [saveHtml=false] - Also dump the raw listing HTML to OUT.
 * @returns {Promise<object[]>} The items found on this page.
 * @throws {Error} If the listing page download or the JSON write fails.
 */
async function scrapeOnePage(menuId, page, saveHtml = false) {
  const url = `${BASE}/public/list/data/index/menu/${menuId}/page/${page}`;
  const html = curlHtml(url);

  if (saveHtml) {
    fs.writeFileSync(path.join(OUT, `page-menu-${menuId}-page-${page}.html`), html, "utf8");
  }

  const $ = cheerio.load(html);
  const items = [];

  // A plain loop is required here: `.each(async ...)` would not wait for the
  // awaited detail requests.
  for (const row of $(".row.data-row").toArray()) {
    const el = $(row);

    const title = el.find(".col-sm-8").text().replace(/\s+/g, " ").trim();
    if (!title) continue;

    const href = (el.find("a.listdataconfig_link").attr("href") || "").trim();
    if (!href) continue;

    const linkD = absUrl(href);
    const date = el.find(".col-sm-2").last().text().trim();
    const imgSrc = el.find("img").attr("src");

    // A broken detail page must not drop the row: keep the listing data and
    // fall back to an empty detail, but report what went wrong.
    let detail = { imgs: [], text: "", files: null };
    try {
      detail = await scrapeDetailImagesContent(linkD);
    } catch (err) {
      console.warn(`⚠️ detail page failed (${linkD}): ${err?.message ?? err}`);
    }

    items.push({
      title,
      detailRef: linkD,
      detail: {
        img: detail.imgs,
        content: detail.text,
        link: detail.files,
      },
      date: date || null,
      image: absUrl(imgSrc),
      sourcePage: page,
      sourceUrl: url,
    });
  }

  const output = {
    source: url,
    scrapedAt: new Date().toISOString(),
    menuId,
    page,
    count: items.length,
    items,
  };

  fs.writeFileSync(
    path.join(OUT, `list-menu-${menuId}-page-${page}.json`),
    JSON.stringify(output, null, 2),
    "utf8"
  );

  console.log(`✅ page ${page} -> items ${items.length}`);
  return items;
}

/**
 * Entry point: scrapes every listing page of the configured menu, removes
 * duplicate items and writes the merged result to
 * `list-menu-<menuId>-all.json` inside OUT.
 *
 * Async so that each page can be awaited in turn.
 */
async function main() {
  const menuId = 1554;
  const totalPages = 53;

  const all = [];
  const seen = new Set();

  for (let page = 1; page <= totalPages; page++) {
    let items;
    try {
      items = await scrapeOnePage(menuId, page, false);
    } catch (err) {
      // One unreachable page should not discard the pages already scraped;
      // record the failure in the exit code and move on.
      console.error(`❌ page ${page} failed: ${err?.message ?? err}`);
      process.exitCode = 1;
      continue;
    }

    for (const it of items) {
      // Items are considered duplicates when title, date and thumbnail match.
      const key = `${it.title}|${it.date || ""}|${it.image || ""}`;
      if (seen.has(key)) continue;
      seen.add(key);
      all.push(it);
    }
  }

  const merged = {
    menuId,
    totalPages,
    scrapedAt: new Date().toISOString(),
    totalItems: all.length,
    items: all,
  };

  const outAll = path.join(OUT, `list-menu-${menuId}-all.json`);
  fs.writeFileSync(outAll, JSON.stringify(merged, null, 2), "utf8");

  console.log("✅ Saved merged JSON:", outAll);
  console.log("✅ Total unique items:", all.length);
}

main().catch((err) => {
  console.error("❌ Scrape failed:", err);
  process.exitCode = 1;
});