// Home > Services > KM (Learning Organization)
//
// Scrapes every page of one listing menu on ladsawai.go.th. For each listed
// item it collects the attached files (resolving their real download path) or,
// when the item has no files, the article text and images. Results are written
// as one JSON file per page plus a merged JSON file into the OUT directory.

const { execFileSync, execSync } = require("child_process");
const cheerio = require("cheerio");
const fs = require("fs");
const path = require("path");
const axios = require("axios").default;

const BASE = "https://ladsawai.go.th";
const OUT = path.join(process.cwd(), "KM องค์กรแห่งการเรียนรู้");

fs.mkdirSync(OUT, { recursive: true });

/**
 * Download a URL with the system `curl` binary and return the body as text.
 *
 * The URL is passed as a discrete argument (no shell involved), so characters
 * such as quotes, `$` or backticks inside a scraped href cannot be interpreted
 * as shell syntax.
 *
 * @param {string} url - Absolute URL to fetch.
 * @returns {string} Response body decoded as UTF-8.
 * @throws {Error} When `curl` cannot be started, exits non-zero, or the output
 *   exceeds the 30 MiB buffer.
 */
function curlHtml(url) {
  const args = [
    "-L",
    "-s",
    url,
    "-H",
    "User-Agent: Mozilla/5.0",
    "-H",
    "Accept-Language: th-TH,th;q=0.9",
  ];
  return execFileSync("curl", args, {
    encoding: "utf8",
    maxBuffer: 30 * 1024 * 1024,
  });
}

/**
 * Turn an href found on the site into an absolute URL.
 *
 * @param {string|null|undefined} href - Raw attribute value.
 * @returns {string|null} Absolute URL, or `null` when `href` is empty.
 */
function absUrl(href) {
  if (!href) return null;
  // Protocol-relative link: must not be glued onto BASE.
  if (href.startsWith("//")) return `https:${href}`;
  if (href.startsWith("http")) return href;
  if (href.startsWith("/")) return BASE + href;
  return `${BASE}/${href}`;
}

/**
 * Text of a cheerio selection with some descendants removed and all runs of
 * whitespace collapsed to single spaces. The selection itself is not modified
 * (the work is done on a clone).
 *
 * @param {object} $el - Cheerio selection.
 * @param {string} dropSelector - Descendants to remove before reading text.
 * @returns {string} Normalized text.
 */
function visibleText($el, dropSelector) {
  return $el
    .clone()
    .find(dropSelector)
    .remove()
    .end()
    .text()
    .replace(/\s+/g, " ")
    .trim();
}

/**
 * Extract gallery images, body text and the main image from a detail page.
 *
 * @param {string} detailUrl - URL of the detail page.
 * @param {string} [html] - Already-downloaded HTML of that page; fetched with
 *   `curlHtml(detailUrl)` when omitted.
 * @returns {{imgs: string[], text: string, mainImage: string}} `imgs` are
 *   de-duplicated absolute URLs; `mainImage` is `""` when none is found.
 * @throws {Error} When `html` is omitted and the download fails.
 */
function scrapeDetailImagesContent(detailUrl, html = curlHtml(detailUrl)) {
  const $ = cheerio.load(html);

  // ---------- images ----------
  const imgSet = new Set();

  $(".maingroup.gallery a[href]").each((_, a) => {
    const full = absUrl(($(a).attr("href") || "").trim());
    if (full) imgSet.add(full);
  });

  // No gallery links: fall back to any anchor pointing at an image file.
  if (imgSet.size === 0) {
    $("a[href]").each((_, a) => {
      const full = absUrl(($(a).attr("href") || "").trim());
      if (full && /\.(jpg|jpeg|png|webp|gif)(\?|$)/i.test(full)) {
        imgSet.add(full);
      }
    });
  }

  // ---------- content ----------
  // Pick the non-gallery box that carries the most real text.
  let bestBox = null;
  let bestScore = -1;

  $(".col-12.maingroup")
    .not(".gallery")
    .each((_, el) => {
      const $el = $(el);
      // Ignore emoji images, scripts and styles when measuring text.
      const text = visibleText($el, "img, script, style");
      // Each <p> adds extra weight on top of the text length.
      const score = text.length + $el.find("p").length * 50;
      if (score > bestScore) {
        bestScore = score;
        bestBox = $el;
      }
    });

  let content = "";
  if (bestBox && bestBox.length) {
    const lines = [];

    // Walk headings (h2) and paragraphs (p) in DOM order.
    bestBox.find("h2, p").each((_, el) => {
      // Strip emoji images embedded in the heading/paragraph.
      const line = visibleText($(el), "img");
      if (line) lines.push(line);
    });

    content = lines.length
      ? lines.join("\n")
      : visibleText(bestBox, "img, script, style");
  }

  // `.attr` reads the first matched element; undefined when nothing matched.
  const mainSrc = ($(".imagestopic img[src]").attr("src") || "").trim();
  const mainImageUrl = absUrl(mainSrc) || "";

  return { imgs: [...imgSet], text: content, mainImage: mainImageUrl };
}

/**
 * Build the listing URL for a menu and page number.
 *
 * @param {number|string} menuId - Menu identifier.
 * @param {number|string} page - Page number.
 * @returns {string} Absolute listing URL.
 */
function buildUrl(menuId, page) {
  return `${BASE}/public/list/data/index/menu/${menuId}/page/${page}`;
}

/**
 * Guess the number of listing pages from a loaded page.
 *
 * NOTE(review): every `<a>` whose text is purely digits is considered, not
 * only pagination links, so an unrelated numeric link would inflate the
 * result — confirm against the real markup.
 *
 * @param {Function} $ - Cheerio root of a listing page.
 * @returns {number} Largest numeric anchor text found, at least 1.
 */
function detectTotalPages($) {
  let maxPage = 1;
  $("a").each((_, a) => {
    const label = $(a).text().trim();
    if (/^\d+$/.test(label)) maxPage = Math.max(maxPage, Number(label));
  });
  return maxPage;
}

/**
 * Collect downloadable file links from a detail page.
 *
 * Links with class `uploadconfig_link` are read first (preferring `data-href`
 * over `href`); their text may embed a download counter, which is split out.
 * Afterwards any other anchor whose URL ends in a known document/archive
 * extension is added if its URL has not been seen yet.
 *
 * @param {string} detailUrl - URL of the detail page.
 * @param {string} [html] - Already-downloaded HTML of that page; fetched with
 *   `curlHtml(detailUrl)` when omitted.
 * @returns {Array<{text: string|null, url: string, downloadCount: number}>}
 *   `downloadCount` is 0 when no counter could be parsed.
 * @throws {Error} When `html` is omitted and the download fails.
 */
function extractFileLinksFromDetail(detailUrl, html = curlHtml(detailUrl)) {
  const $ = cheerio.load(html);
  const files = [];

  $("a.uploadconfig_link").each((_, a) => {
    const el = $(a);
    const fileUrl = absUrl(el.attr("data-href") || el.attr("href"));
    if (!fileUrl) return;

    const text = el.text().replace(/\s+/g, " ").trim() || null;
    let title = text;
    let downloadCount = 0;

    if (text && text.includes("ดาวน์โหลดแล้ว")) {
      const parts = text.split(" ดาวน์โหลดแล้ว ");
      // Only split when the marker is surrounded by spaces as expected;
      // otherwise keep the whole text as the title.
      if (parts.length > 1) {
        title = parts[0];
        const parsed = Number.parseInt(
          parts[1].replace("ครั้ง", "").trim(),
          10
        );
        downloadCount = Number.isNaN(parsed) ? 0 : parsed;
      }
    }

    files.push({ text: title, url: fileUrl, downloadCount });
  });

  // Fallback: direct links to files.
  const seenUrls = new Set(files.map((f) => f.url));
  $("a[href]").each((_, a) => {
    const u = absUrl($(a).attr("href"));
    if (!u) return;
    if (!/\.(pdf|doc|docx|xls|xlsx|ppt|pptx|zip|rar)(\?|$)/i.test(u)) return;
    if (seenUrls.has(u)) return;
    seenUrls.add(u);
    files.push({ text: $(a).text().trim() || null, url: u, downloadCount: 0 });
  });

  return files;
}

/**
 * Call `<fileUrl>/status/1/` and return the `path` field of the response.
 *
 * Best effort: any request failure is logged and reported as `null`.
 *
 * @param {string} fileUrl - Absolute file URL taken from a detail page.
 * @returns {Promise<string|null>} The `path` value, or `null` when the request
 *   fails or the response has no non-empty `path`.
 */
async function resolveRealFilePath(fileUrl) {
  try {
    // Avoid a double slash when the URL already ends with one.
    const statusUrl = `${fileUrl.replace(/\/$/, "")}/status/1/`;
    const res = await axios.get(statusUrl, { timeout: 30000 });
    return res?.data?.path || null;
  } catch (err) {
    console.warn(`warn : cannot resolve real path for ${fileUrl}:`, err.message);
    return null;
  }
}

/**
 * Map over an array with at most `limit` mapper calls in flight.
 *
 * @param {Array} arr - Input items.
 * @param {number} limit - Maximum number of concurrent workers (values below
 *   1 are treated as 1).
 * @param {Function} mapper - `(item, index) => value | Promise<value>`.
 * @returns {Promise<Array>} Results in input order.
 */
async function mapLimit(arr, limit, mapper) {
  const results = [];
  let next = 0;

  async function worker() {
    while (next < arr.length) {
      const idx = next++;
      results[idx] = await mapper(arr[idx], idx);
    }
  }

  const workerCount = Math.min(Math.max(1, limit), arr.length);
  await Promise.all(Array.from({ length: workerCount }, () => worker()));
  return results;
}

/**
 * Scrape one row of a listing page into an output item.
 *
 * The detail page is downloaded once and reused for file extraction, content
 * extraction and the optional debug dump.
 *
 * @param {Function} $ - Cheerio root of the listing page.
 * @param {object} row - Row element (`.row.data-row`).
 * @param {{menuId: (number|string), page: (number|string), sourceUrl: string,
 *   saveHtml: boolean}} ctx - Listing context.
 * @returns {Promise<object|null>} The item, or `null` when the row has no
 *   usable link or title.
 */
async function scrapeListRow($, row, { menuId, page, sourceUrl, saveHtml }) {
  const link = $(row).find("a.listdataconfig_link[href]").first();
  if (!link.length) return null;

  const title =
    link.find("label.font-weight").text().replace(/\s+/g, " ").trim() ||
    link.text().replace(/\s+/g, " ").trim();
  if (!title) return null;

  const detailUrl = absUrl(link.attr("href"));

  // One download per detail page; a failure must not abort the whole run.
  let detailHtml = null;
  if (detailUrl) {
    try {
      detailHtml = curlHtml(detailUrl);
    } catch (err) {
      console.error(`error : cannot fetch ${detailUrl}:`, err.message);
    }
  }

  let files = [];
  if (detailHtml !== null) {
    try {
      files = extractFileLinksFromDetail(detailUrl, detailHtml);
    } catch (err) {
      console.error(`error : cannot read files of ${detailUrl}:`, err.message);
      files = [];
    }
  }

  const resolvedFiles = [];
  for (const file of files) {
    const realPath = await resolveRealFilePath(file.url);
    resolvedFiles.push({
      fileName: file.text,
      fileUrl: file.url, // link as found on the detail page
      // Resolved through /status/1/; left empty when the lookup failed.
      filePath: realPath ? `${BASE}/public/${realPath}` : "",
      downloadCount: file.downloadCount,
    });
  }

  // Items without files are treated as articles: keep their text and images.
  let detail;
  if (files.length === 0) {
    detail = { img: [], content: "", mainImage: "" };
    if (detailHtml !== null) {
      try {
        const { text, imgs, mainImage } = scrapeDetailImagesContent(
          detailUrl,
          detailHtml
        );
        detail = { img: imgs, content: text, mainImage };
      } catch (err) {
        console.error(`error : cannot read detail of ${detailUrl}:`, err.message);
      }
    }
  }

  let detailHtmlPath = "";
  if (saveHtml && detailHtml !== null) {
    try {
      // URL shape: /public/list/data/detail/id/{id}/menu/{menu}/page/{page}
      const urlMatch = detailUrl.match(/\/id\/(\d+)\/menu\/(\d+)/);
      const id = urlMatch ? urlMatch[1] : null;
      detailHtmlPath = `debug-menu-${menuId}-detail-${id}.html`;
      fs.writeFileSync(path.join(OUT, detailHtmlPath), detailHtml, "utf8");
    } catch (error) {
      console.error("error :", error);
      detailHtmlPath = "";
    }
  }

  return {
    title,
    detailUrl: detailUrl || null,
    files: resolvedFiles,
    detail,
    detailPageHtml: detailHtmlPath,
    sourcePage: page,
    sourceUrl,
  };
}

/**
 * Scrape one listing page, write `menu-<menuId>-page-<page>.json` into OUT
 * and return the parsed page together with its items.
 *
 * @param {number|string} menuId - Menu identifier.
 * @param {number|string} page - Page number.
 * @param {boolean} [saveHtml=false] - Also dump the listing and detail HTML
 *   into OUT for debugging.
 * @returns {Promise<{$: Function, items: object[]}>} Cheerio root of the
 *   listing page and the scraped items.
 * @throws {Error} When the listing page cannot be downloaded or the output
 *   files cannot be written.
 */
async function scrapeOnePage(menuId, page, saveHtml = false) {
  const url = buildUrl(menuId, page);
  const html = curlHtml(url);

  if (saveHtml) {
    fs.writeFileSync(
      path.join(OUT, `debug-menu-${menuId}-page-${page}.html`),
      html,
      "utf8"
    );
  }

  const $ = cheerio.load(html);
  const rows = $(".row.data-row").toArray();

  // Up to 5 rows in flight. curlHtml blocks the event loop, so in practice
  // only the axios status lookups of different rows overlap.
  const scraped = await mapLimit(rows, 5, (row) =>
    scrapeListRow($, row, { menuId, page, sourceUrl: url, saveHtml })
  );
  const items = scraped.filter(Boolean); // drop skipped rows

  const output = {
    source: url,
    scrapedAt: new Date().toISOString(),
    menuId,
    page,
    count: items.length,
    items,
  };

  fs.writeFileSync(
    path.join(OUT, `menu-${menuId}-page-${page}.json`),
    JSON.stringify(output, null, 2),
    "utf8"
  );

  console.log(`✅ page ${page} -> items ${items.length}`);
  return { $, items };
}

/**
 * Entry point: scrape all pages of the menu, de-duplicate items by title and
 * detail URL, and write the merged `menu-<menuId>-all.json`.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const menuId = 1628;

  const first = await scrapeOnePage(menuId, 1, true);
  const totalPages = detectTotalPages(first.$);
  console.log("✅ totalPages =", totalPages);

  const all = [];
  const seen = new Set();

  const addItems = (items) => {
    for (const it of items) {
      const key = `${it.title}|${it.detailUrl || ""}`;
      if (seen.has(key)) continue;
      seen.add(key);
      all.push(it);
    }
  };

  addItems(first.items);

  for (let p = 2; p <= totalPages; p++) {
    const { items } = await scrapeOnePage(menuId, p, false);
    addItems(items);
  }

  const merged = {
    menuId,
    totalPages,
    scrapedAt: new Date().toISOString(),
    totalItems: all.length,
    items: all,
  };

  const outAll = path.join(OUT, `menu-${menuId}-all.json`);
  fs.writeFileSync(outAll, JSON.stringify(merged, null, 2), "utf8");

  console.log("🎉 Saved all:", outAll);
  console.log("🎉 Total unique:", all.length);
}

main().catch((err) => {
  // Surface the failure instead of leaving an unhandled rejection.
  console.error("error :", err);
  process.exitCode = 1;
});