// Scraper for the public-relations news list of ladsawai.go.th.
// For every list page it visits each item's detail page, collects images,
// body text and the first attachment, and writes per-page and merged JSON.

// NOTE(review): execSync is no longer used by the code below; it is kept in
// scope in case code outside this view still references it.
const { execSync, execFileSync } = require("child_process");
const cheerio = require("cheerio");
const fs = require("fs");
const path = require("path");
const axios = require("axios").default;

const BASE = "https://ladsawai.go.th";
const OUT = path.join(process.cwd(), "ข่าวประชาสัมพันธ์");

// Attachment / image extensions used by the fallback link scans.
const FILE_EXT_RE = /\.(pdf|doc|docx|xls|xlsx|ppt|pptx|zip|rar)(\?|$)/i;
const IMAGE_EXT_RE = /\.(jpg|jpeg|png|webp|gif)(\?|$)/i;

fs.mkdirSync(OUT, { recursive: true });

/**
 * Download a page with the system `curl` binary and return its body.
 *
 * The URL is passed as a separate argv entry (no shell), so characters such
 * as quotes, `$` or backticks inside a scraped link cannot be interpreted as
 * shell syntax.
 *
 * @param {string} url - Absolute URL to fetch.
 * @returns {string} Response body decoded as UTF-8.
 * @throws {Error} If curl cannot be spawned or exits with a non-zero status.
 */
function curlHtml(url) {
  const args = [
    "-L",
    "-s",
    "-H",
    "User-Agent: Mozilla/5.0",
    "-H",
    "Accept-Language: th-TH,th;q=0.9",
    "--url",
    url,
  ];
  return execFileSync("curl", args, {
    encoding: "utf8",
    maxBuffer: 30 * 1024 * 1024,
  });
}

/**
 * Turn a possibly relative link into an absolute URL on BASE.
 *
 * @param {string|null|undefined} src - Raw href/src value.
 * @returns {string|null} Absolute URL, or null when `src` is empty.
 */
function absUrl(src) {
  if (!src) return null;
  // Protocol-relative link ("//host/path"): must not be glued onto BASE.
  if (src.startsWith("//")) return `https:${src}`;
  if (src.startsWith("http")) return src;
  if (src.startsWith("/")) return BASE + src;
  // Covers values without a leading slash, e.g. data-href = "public/....".
  return `${BASE}/${src}`;
}

/**
 * Ask the `<fileUrl>/status/1/` endpoint for the real file path.
 *
 * Best-effort: any failure is logged and reported as null.
 * NOTE(review): assumes the endpoint answers with JSON carrying a `path`
 * field - confirm against the site's API.
 *
 * @param {string} fileUrl - Absolute URL of the attachment link.
 * @returns {Promise<string|null>} The `path` value from the response, or null.
 */
async function resolveRealFilePath(fileUrl) {
  try {
    const statusUrl = `${fileUrl.replace(/\/$/, "")}/status/1/`;
    const res = await axios.get(statusUrl, { timeout: 30000 });
    return res?.data?.path || null;
  } catch (err) {
    console.warn(`⚠️ could not resolve file path for ${fileUrl}: ${err?.message ?? err}`);
    return null;
  }
}

/** Text of a node with the `strip` descendants removed and whitespace collapsed. */
function cleanText($node, strip) {
  return $node
    .clone()
    .find(strip)
    .remove()
    .end()
    .text()
    .replace(/\s+/g, " ")
    .trim();
}

/** Collect attachment URLs: dedicated upload links first, else any link with a known file extension. */
function collectFileUrls($) {
  const urls = new Set();
  const targetOf = (a) => {
    const $a = $(a);
    return absUrl(($a.attr("data-href") || $a.attr("href") || "").trim());
  };

  $("a.uploadconfig_link").each((_, a) => {
    const full = targetOf(a);
    if (full) urls.add(full);
  });

  if (urls.size === 0) {
    $("a[href], a[data-href]").each((_, a) => {
      const full = targetOf(a);
      if (full && FILE_EXT_RE.test(full)) urls.add(full);
    });
  }
  return urls;
}

/** Collect image URLs: gallery links first, else any link with a known image extension. */
function collectImageUrls($) {
  const urls = new Set();

  $(".maingroup.gallery a[href]").each((_, a) => {
    const full = absUrl(($(a).attr("href") || "").trim());
    if (full) urls.add(full);
  });

  if (urls.size === 0) {
    $("a[href]").each((_, a) => {
      const full = absUrl(($(a).attr("href") || "").trim());
      if (full && IMAGE_EXT_RE.test(full)) urls.add(full);
    });
  }
  return urls;
}

/** Pick the non-gallery content box with the most text (each <p> adds a 50-point bonus) and return its text. */
function extractContent($) {
  let bestBox = null;
  let bestScore = -1;

  $(".col-12.maingroup")
    .not(".gallery")
    .each((_, el) => {
      const $el = $(el);
      const score = cleanText($el, "img, script, style").length + $el.find("p").length * 50;
      if (score > bestScore) {
        bestScore = score;
        bestBox = $el;
      }
    });

  if (!bestBox || !bestBox.length) return "";

  // Prefer one line per non-empty paragraph; fall back to the whole box text.
  const lines = [];
  bestBox.find("p").each((_, p) => {
    const t = cleanText($(p), "img");
    if (t) lines.push(t);
  });
  return lines.length ? lines.join("\n") : cleanText(bestBox, "img, script, style");
}

/**
 * Scrape one detail page.
 *
 * @param {string} detailUrl - Absolute URL of the detail page.
 * @returns {Promise<{imgs: string[], text: string, files: string|null}>}
 *   Image URLs, body text, and the resolved URL of the first attachment
 *   (null when there is none or it could not be resolved).
 * @throws {Error} If the page download fails.
 */
async function scrapeDetailImagesContent(detailUrl) {
  const $ = cheerio.load(curlHtml(detailUrl));

  // A Set has no index access, so destructure its first entry.
  const [firstFileUrl = null] = collectFileUrls($);
  let realPath = null;
  if (firstFileUrl) {
    const p = await resolveRealFilePath(firstFileUrl);
    realPath = p ? `${BASE}/public/${p}` : null;
  }

  return {
    imgs: [...collectImageUrls($)],
    text: extractContent($),
    files: realPath,
  };
}

/**
 * Scrape one list page, enrich every row with its detail page, and write the
 * result to `list-menu-<menuId>-page-<page>.json` inside OUT.
 *
 * @param {number} menuId - Menu id used in the list URL.
 * @param {number} page - 1-based page number.
 * @param {boolean} [saveHtml=false] - Also dump the raw list HTML into OUT.
 * @returns {Promise<object[]>} The scraped items of this page.
 * @throws {Error} If the list page download or a file write fails.
 */
async function scrapeOnePage(menuId, page, saveHtml = false) {
  const url = `${BASE}/public/list/data/index/menu/${menuId}/page/${page}`;
  const html = curlHtml(url);

  if (saveHtml) {
    fs.writeFileSync(path.join(OUT, `page-menu-${menuId}-page-${page}.html`), html, "utf8");
  }

  const $ = cheerio.load(html);
  const items = [];

  // A plain loop (not .each(async ...)) so every detail request is awaited in order.
  for (const row of $(".row.data-row").toArray()) {
    const el = $(row);

    const title = el.find(".col-sm-8").text().replace(/\s+/g, " ").trim();
    if (!title) continue;

    const href = (el.find("a.listdataconfig_link").attr("href") || "").trim();
    if (!href) continue;

    const linkD = absUrl(href);
    const date = el.find(".col-sm-2").last().text().trim();
    const imgSrc = el.find("img").attr("src");

    let detail = { imgs: [], text: "", files: null };
    try {
      detail = await scrapeDetailImagesContent(linkD);
    } catch (err) {
      // A broken detail page must not abort the whole list page; keep the
      // empty defaults but leave a trace of what went wrong.
      console.warn(`⚠️ detail scrape failed for ${linkD}: ${err?.message ?? err}`);
    }

    items.push({
      title,
      detailRef: linkD,
      detail: {
        img: detail.imgs,
        content: detail.text,
        link: detail.files,
      },
      date: date || null,
      image: absUrl(imgSrc),
      sourcePage: page,
      sourceUrl: url,
    });
  }

  const output = {
    source: url,
    scrapedAt: new Date().toISOString(),
    menuId,
    page,
    count: items.length,
    items,
  };

  fs.writeFileSync(
    path.join(OUT, `list-menu-${menuId}-page-${page}.json`),
    JSON.stringify(output, null, 2),
    "utf8"
  );

  console.log(`✅ page ${page} -> items ${items.length}`);
  return items;
}

// Entry point: walk every list page, de-duplicate items across pages and
// write the merged JSON. Any failure is reported and yields a non-zero exit code.
(async function main() {
  const menuId = 1554;
  const totalPages = 53;

  const all = [];
  const seen = new Set();

  for (let page = 1; page <= totalPages; page++) {
    const items = await scrapeOnePage(menuId, page, false);

    for (const it of items) {
      // Items are considered identical when title, date and thumbnail match.
      const key = `${it.title}|${it.date || ""}|${it.image || ""}`;
      if (seen.has(key)) continue;
      seen.add(key);
      all.push(it);
    }
  }

  const merged = {
    menuId,
    totalPages,
    scrapedAt: new Date().toISOString(),
    totalItems: all.length,
    items: all,
  };

  const outAll = path.join(OUT, `list-menu-${menuId}-all.json`);
  fs.writeFileSync(outAll, JSON.stringify(merged, null, 2), "utf8");

  console.log("✅ Saved merged JSON:", outAll);
  console.log("✅ Total unique items:", all.length);
})().catch((err) => {
  console.error("❌ Scrape failed:", err);
  process.exitCode = 1;
});