// migrate-lsw/forms-download.js

// Download forms: scrapes the form-download listing and records every file link found.
const { execFileSync, execSync } = require("child_process");
const cheerio = require("cheerio");
const fs = require("fs");
const path = require("path");

// Site origin used to build listing URLs and to absolutize scraped links.
const BASE = "https://ladsawai.go.th";

// Output folder under the current working directory. It is created eagerly
// (including missing parents) because the scraper writes its JSON and debug
// HTML files into it.
const OUT = path.resolve("ดาวน์โหลดแบบฟอร์ม");
fs.mkdirSync(OUT, { recursive: true });

/**
 * Fetches a URL with the system `curl` binary and returns the response body.
 *
 * curl is started directly with an argument array instead of through a shell
 * command string, so characters such as quotes, `$`, backticks or `;` inside
 * `url` reach curl verbatim and are never interpreted as shell syntax.
 *
 * @param {string} url - Address to fetch; redirects are followed (`-L`).
 * @returns {string} Response body decoded as UTF-8.
 * @throws {Error} If curl cannot be started, exits with a non-zero status, or
 *   produces more output than the 30 MiB buffer allows.
 */
function curlHtml(url) {
  const args = [
    "-L",
    "-s",
    url,
    "-H",
    "User-Agent: Mozilla/5.0",
    "-H",
    "Accept-Language: th-TH,th;q=0.9",
  ];
  return execFileSync("curl", args, {
    encoding: "utf8",
    maxBuffer: 30 * 1024 * 1024,
  });
}

/**
 * Converts a link target taken from scraped HTML into an absolute URL.
 *
 * - `http://` / `https://` links (scheme compared case-insensitively) are
 *   returned as-is. A relative path that merely starts with the letters
 *   "http" (e.g. `httpdocs/a.pdf`) is still treated as relative.
 * - Protocol-relative links (`//host/path`) keep their own host and take the
 *   scheme of `BASE`.
 * - Root-relative and plain relative links are appended to `BASE`.
 *
 * @param {string|null|undefined} href - Raw attribute value.
 * @returns {string|null} Absolute URL, or `null` when `href` is missing or
 *   contains only whitespace.
 */
function absUrl(href) {
  if (!href) return null;

  // Attribute values may carry stray whitespace that would corrupt the URL.
  const target = String(href).trim();
  if (!target) return null;

  if (/^https?:\/\//i.test(target)) return target;
  if (target.startsWith("//")) return `${new URL(BASE).protocol}${target}`;
  if (target.startsWith("/")) return `${BASE}${target}`;
  return `${BASE}/${target}`;
}

/**
 * Builds the listing URL for one page of a menu.
 *
 * @param {number|string} menuId - Menu identifier placed after `/menu/`.
 * @param {number|string} page - Page number placed after `/page/`.
 * @returns {string} URL of the form `BASE/public/list/data/index/menu/<menuId>/page/<page>`.
 */
function buildUrl(menuId, page) {
  const route = [
    "public",
    "list",
    "data",
    "index",
    "menu",
    String(menuId),
    "page",
    String(page),
  ].join("/");
  return `${BASE}/${route}`;
}

/**
 * Finds the highest page number shown in the pagination.
 *
 * Every `<a>` in the document is inspected; an anchor counts when its trimmed
 * text consists solely of digits.
 * NOTE(review): any digits-only link text anywhere on the page is counted, not
 * only pagination links — confirm against the real markup.
 *
 * @param {Function} $ - Loaded cheerio document.
 * @returns {number} Largest numeric anchor label, or 1 when there is none.
 */
function detectTotalPages($) {
  const digitsOnly = /^\d+$/;
  let highest = 1;

  $("a").each((_, anchor) => {
    const label = $(anchor).text().trim();
    if (!digitsOnly.test(label)) return;

    const pageNo = Number(label);
    if (pageNo > highest) highest = pageNo;
  });

  return highest;
}

/**
 * Resolves the file URL an anchor points to.
 *
 * A non-empty `data-href` attribute wins over `href`; the chosen value is
 * made absolute with `absUrl`.
 *
 * @param {object} $a - Cheerio-wrapped `<a>` element.
 * @returns {string|null} Whatever `absUrl` returns for the chosen attribute.
 */
function pickFileUrlFromAnchor($a) {
  const fallback = $a.attr("href");
  const preferred = $a.attr("data-href");

  if (preferred) return absUrl(preferred);
  return absUrl(fallback);
}

// Anchors whose href/data-href points into the upload directory (preferred match).
const UPLOAD_LINK_SELECTOR =
  'a[href*="/public/list/upload/"], a[data-href*="/public/list/upload/"]';

// Anchors whose href/data-href looks like a pdf/doc/docx/xls/xlsx file.
const DOCUMENT_LINK_SELECTOR =
  'a[href$=".pdf"],a[href*=".pdf?"],a[data-href$=".pdf"],a[data-href*=".pdf?"],a[href$=".doc"],a[href$=".docx"],a[href$=".xls"],a[href$=".xlsx"],a[data-href$=".doc"],a[data-href$=".docx"],a[data-href$=".xls"],a[data-href$=".xlsx"]';

/** Collapses every whitespace run to a single space and trims the ends. */
function collapseWhitespace(text) {
  return text.replace(/\s+/g, " ").trim();
}

/** Returns the last path segment of a URL without query string/fragment, or null. */
function fileNameFromUrl(fileUrl) {
  const withoutQuery = fileUrl.split(/[?#]/)[0];
  return withoutQuery.split("/").pop() || null;
}

/** Builds one output record; key order matches the JSON written to disk. */
function makeItem(title, fileUrl, page, url) {
  return {
    title,
    fileUrl,
    filename: fileNameFromUrl(fileUrl),
    sourcePage: page,
    sourceUrl: url,
  };
}

/** Extracts at most one item per `.row.data-row` element (first file link in the row). */
function collectRowItems($, rows, page, url) {
  const items = [];

  rows.each((_, row) => {
    const el = $(row);

    // Prefer links into the upload directory; otherwise accept any link that
    // looks like a document by extension.
    const uploadLinks = el.find(UPLOAD_LINK_SELECTOR);
    const $a = (uploadLinks.length ? uploadLinks : el.find(DOCUMENT_LINK_SELECTOR)).first();
    if (!$a.length) return;

    const fileUrl = pickFileUrlFromAnchor($a);
    if (!fileUrl) return;

    // Title is the row's text with <label> elements removed (done on a clone
    // so the parsed document is not modified); the link text is the fallback.
    const rowText = collapseWhitespace(el.clone().find("label").remove().end().text());
    const title = rowText || collapseWhitespace($a.text()) || null;

    items.push(makeItem(title, fileUrl, page, url));
  });

  return items;
}

/**
 * Fallback for pages without `.row.data-row`: one item per distinct file URL
 * found anywhere in the document, titled with the link's own text.
 */
function collectLooseItems($, page, url) {
  const byFileUrl = new Map();

  $(`${UPLOAD_LINK_SELECTOR}, ${DOCUMENT_LINK_SELECTOR}`).each((_, anchor) => {
    const $a = $(anchor);
    const fileUrl = pickFileUrlFromAnchor($a);
    if (!fileUrl) return;

    const title = collapseWhitespace($a.text()) || null;
    const known = byFileUrl.get(fileUrl);
    if (!known) {
      byFileUrl.set(fileUrl, makeItem(title, fileUrl, page, url));
    } else if (!known.title && title) {
      // A text-less link to the same file (e.g. an icon) came first; keep the
      // first non-empty label.
      known.title = title;
    }
  });

  return [...byFileUrl.values()];
}

/**
 * Downloads one listing page, extracts its file links and writes the result to
 * `menu-<menuId>-page-<page>.json` inside `OUT`.
 *
 * When the page contains `.row.data-row` elements, each row contributes at
 * most one item. Otherwise every distinct file link in the document becomes
 * its own item, instead of treating the whole `<body>` as a single row (which
 * produced at most one item whose title was the text of the entire page).
 *
 * `filename` is the last path segment of the file URL with any query string
 * or fragment stripped.
 *
 * @param {number|string} menuId - Menu identifier of the listing.
 * @param {number|string} page - Page number to fetch.
 * @param {boolean} [saveHtml=false] - Also write the raw HTML to
 *   `debug-menu-<menuId>-page-<page>.html` inside `OUT`.
 * @returns {{ $: Function, items: object[], debugPdfCount: number }} The
 *   loaded cheerio document, the extracted items, and the number of pdf or
 *   upload-directory links present anywhere in the HTML.
 * @throws {Error} Propagates failures from `curlHtml` and from the file writes.
 */
function scrapeOnePage(menuId, page, saveHtml = false) {
  const url = buildUrl(menuId, page);
  const html = curlHtml(url);

  if (saveHtml) {
    fs.writeFileSync(path.join(OUT, `debug-menu-${menuId}-page-${page}.html`), html, "utf8");
  }

  const $ = cheerio.load(html);

  const rows = $(".row.data-row");
  const items = rows.length
    ? collectRowItems($, rows, page, url)
    : collectLooseItems($, page, url);

  // Diagnostic only: how many pdf / upload-directory links exist in the whole page.
  const debugPdfCount = $(
    'a[href$=".pdf"],a[href*=".pdf?"],a[data-href$=".pdf"],a[data-href*=".pdf?"],a[href*="/public/list/upload/"],a[data-href*="/public/list/upload/"]'
  ).length;

  const output = {
    source: url,
    scrapedAt: new Date().toISOString(),
    menuId,
    page,
    count: items.length,
    debugPdfCount,
    items,
  };

  fs.writeFileSync(
    path.join(OUT, `menu-${menuId}-page-${page}.json`),
    JSON.stringify(output, null, 2),
    "utf8"
  );

  console.log(`✅ page ${page} -> items ${items.length} (pdfLinksInHtml=${debugPdfCount})`);
  return { $, items, debugPdfCount };
}

/**
 * Entry point: scrapes every page of menu 1210, merges the items (unique by
 * title + file URL, first occurrence wins) and writes `menu-1210-all.json`
 * into `OUT`.
 */
(function main() {
  const menuId = 1210;

  // Page 1 is fetched first (raw HTML saved for debugging) because the page
  // count is read from its pagination.
  const firstPage = scrapeOnePage(menuId, 1, true);

  // No links at all in the HTML returned by curl: warn that the page is
  // probably rendered by JavaScript afterwards.
  const pageLooksEmpty = !firstPage.debugPdfCount && firstPage.items.length === 0;
  if (pageLooksEmpty) {
    console.log("⚠️ หน้า HTML ที่ curl ได้ยังไม่มีลิงก์ไฟล์ (น่าจะถูก JS render ทีหลัง) -> ต้องใช้วิธีดึง endpoint/หรือใช้ browser automation");
  }

  const totalPages = detectTotalPages(firstPage.$);
  console.log("✅ totalPages =", totalPages);

  const uniqueItems = [];
  const knownKeys = new Set();
  const collect = (pageItems) => {
    pageItems.forEach((item) => {
      const key = `${item.title || ""}|${item.fileUrl || ""}`;
      if (!knownKeys.has(key)) {
        knownKeys.add(key);
        uniqueItems.push(item);
      }
    });
  };

  collect(firstPage.items);

  let pageNo = 2;
  while (pageNo <= totalPages) {
    collect(scrapeOnePage(menuId, pageNo, false).items);
    pageNo += 1;
  }

  const outAll = path.join(OUT, `menu-${menuId}-all.json`);
  const merged = {
    menuId,
    totalPages,
    scrapedAt: new Date().toISOString(),
    totalItems: uniqueItems.length,
    items: uniqueItems,
  };
  fs.writeFileSync(outAll, JSON.stringify(merged, null, 2), "utf8");

  console.log("🎉 Saved all:", outAll);
  console.log("🎉 Total unique:", uniqueItems.length);
})();