// migrate-lsw/announcement-results.js

// Announcement results ("ผลประกาศ", menu 1238)
// - list: read each detailUrl from a.listdataconfig_link
// - detail: read the file link from a.uploadconfig_link (data-href/href)
// - resolve: request /status/1/ to get the real path, then build the full URL

const { execFileSync, execSync } = require("child_process");
const fs = require("fs");
const path = require("path");

const axios = require("axios").default;
const cheerio = require("cheerio");

// Site origin (scheme + host) that relative links are joined onto.
const BASE = "https://ladsawai.go.th";
// Output directory under the current working directory; the debug HTML and
// all JSON results produced by this script are written here.
const OUT = path.join(process.cwd(), "ผลประกาศ");
// Create the output directory when the module loads; with `recursive: true`
// an already-existing directory is not an error.
fs.mkdirSync(OUT, { recursive: true });

/**
 * Download a URL with the system `curl` binary and return the response body.
 *
 * curl is started directly (no shell) and the URL is passed as its own
 * argument, so characters such as `"`, `$` or backticks inside a scraped URL
 * are sent as part of the URL instead of being interpreted by a shell.
 *
 * @param {string} url - Absolute URL to fetch.
 * @returns {string} Response body decoded as UTF-8.
 * @throws {Error} If curl cannot be started, exits with a non-zero status,
 *   or produces more output than the 30 MiB buffer allows.
 */
function curlHtml(url) {
  const args = [
    "-L", // follow redirects
    "-s", // no progress meter on stderr
    "-H",
    "User-Agent: Mozilla/5.0",
    "-H",
    "Accept-Language: th-TH,th;q=0.9",
    // `--url` makes curl treat the next argument as the URL even if it
    // happens to start with "-".
    "--url",
    url,
  ];
  return execFileSync("curl", args, {
    encoding: "utf8",
    maxBuffer: 30 * 1024 * 1024,
  });
}

/**
 * Turn an href taken from the scraped HTML into an absolute URL string.
 *
 * - `http://` / `https://` URLs (any letter case) are returned trimmed, unchanged.
 * - Protocol-relative URLs (`//host/path`) get the scheme of `BASE`.
 * - Root-relative (`/x`) and bare relative (`public/x`) paths are joined onto `BASE`.
 *
 * @param {string|null|undefined} href - Raw attribute value.
 * @returns {string|null} Absolute URL, or null when `href` is missing, not a
 *   string, or blank.
 */
function absUrl(href) {
  if (typeof href !== "string") return null;
  const h = href.trim();
  if (!h) return null;

  // Require the full "scheme://" prefix: a plain startsWith("http") would also
  // accept relative paths such as "httpdocs/file.pdf".
  if (/^https?:\/\//i.test(h)) return h;

  // Protocol-relative link: reuse BASE's scheme rather than producing "BASE//host/...".
  if (h.startsWith("//")) return `${new URL(BASE).protocol}${h}`;

  if (h.startsWith("/")) return BASE + h;
  return `${BASE}/${h}`; // bare relative path such as "public/..."
}

/**
 * Build the URL of one page of a menu's paginated listing.
 *
 * @param {number|string} menuId - Menu identifier placed in the `/menu/` segment.
 * @param {number|string} page - Page number placed in the `/page/` segment.
 * @returns {string} Listing URL under `BASE`.
 */
function buildListUrl(menuId, page) {
  const menuRoot = `${BASE}/public/list/data/index/menu/${menuId}`;
  return `${menuRoot}/page/${page}`;
}

/**
 * Estimate the number of listing pages from a loaded document.
 *
 * Looks at every `<a>` element and takes the largest link text that consists
 * solely of digits. Note that any anchor with all-digit text counts, not only
 * pagination links.
 *
 * @param {Function} $ - Cheerio-style selector function for the page.
 * @returns {number} Largest numeric link label found, never less than 1.
 */
function detectTotalPages($) {
  return $("a")
    .toArray()
    .map((link) => $(link).text().trim())
    .filter((label) => /^\d+$/.test(label))
    .reduce((highest, label) => Math.max(highest, Number(label)), 1);
}

/**
 * Resolve a file link to the real download URL by querying its `/status/1/`
 * endpoint and reading the `path` field of the response.
 *
 * Best-effort: any request or parsing failure is logged as a warning and
 * reported as null so the caller can keep going.
 *
 * @param {string} fileUrl - Absolute file link taken from a detail page.
 * @returns {Promise<string|null>} `${BASE}/public/<path>`, or null when the
 *   response has no usable `path` or the request failed.
 */
async function resolveRealFilePath(fileUrl) {
  try {
    // Drop one trailing slash so the status suffix is not joined with "//".
    const statusUrl = `${fileUrl.replace(/\/$/, "")}/status/1/`;
    const res = await axios.get(statusUrl, { timeout: 30000 });
    const p = res?.data?.path;
    if (typeof p !== "string" || !p) return null;
    // Strip leading slashes so the path joins cleanly after "/public/".
    return `${BASE}/public/${p.replace(/^\/+/, "")}`;
  } catch (err) {
    // Surface the failure instead of hiding it; the null return is unchanged.
    console.warn(`resolveRealFilePath failed for ${fileUrl}: ${err?.message ?? err}`);
    return null;
  }
}

/**
 * Fetch a detail page and extract its attached file link, both as it appears
 * on the page (raw) and as resolved through `resolveRealFilePath` (real).
 *
 * The first `a.uploadconfig_link` is preferred (`data-href`, then `href`).
 * If it yields nothing, the first anchor whose absolute URL ends in a known
 * document/archive extension is used instead.
 *
 * @param {string} detailUrl - Absolute URL of the detail page.
 * @returns {Promise<{rawUrl: string|null, realUrl: string|null}>} Both are
 *   null when no file link is found; `realUrl` is whatever
 *   `resolveRealFilePath` returns for `rawUrl`.
 */
async function scrapeDetailFile(detailUrl) {
  const $ = cheerio.load(curlHtml(detailUrl));
  const documentLink = /\.(pdf|doc|docx|xls|xlsx|ppt|pptx|zip|rar)(\?|$)/i;

  // Link target of an anchor: data-href wins over href; blank when neither is set.
  const targetOf = (anchor) =>
    (anchor.attr("data-href") || anchor.attr("href") || "").trim();

  // 1) Preferred source: the dedicated upload link.
  let raw = targetOf($("a.uploadconfig_link").first());

  // 2) Fallback: first anchor that points at a document-like file.
  if (!raw) {
    for (const node of $("a[href], a[data-href]").toArray()) {
      const candidate = targetOf($(node));
      const resolved = absUrl(candidate);
      if (resolved && documentLink.test(resolved)) {
        raw = candidate;
        break;
      }
    }
  }

  const rawUrl = absUrl(raw);
  if (!rawUrl) {
    return { rawUrl, realUrl: null };
  }
  return { rawUrl, realUrl: await resolveRealFilePath(rawUrl) };
}

/**
 * Scrape one listing page: collect every row's title, date, detail URL and
 * icons, visit each detail page for its file link, and write the page result
 * to `list-menu-<menuId>-page-<page>.json` in `OUT`.
 *
 * Detail lookups are best-effort: if one fails, a warning is logged and the
 * item is still recorded with null file URLs, so a single bad detail page
 * does not abort the whole crawl. A failure to fetch the listing page itself
 * still throws.
 *
 * @param {number|string} menuId - Menu identifier of the listing.
 * @param {number|string} page - Page number to scrape.
 * @param {boolean} [saveHtml=false] - Also dump the raw listing HTML to
 *   `debug-menu-<menuId>-page-<page>.html` in `OUT`.
 * @returns {Promise<{$: Function, items: object[]}>} The loaded document
 *   (so the caller can inspect pagination) and the scraped items.
 */
async function scrapeOnePage(menuId, page, saveHtml = false) {
  const url = buildListUrl(menuId, page);
  const html = curlHtml(url);

  if (saveHtml) {
    fs.writeFileSync(path.join(OUT, `debug-menu-${menuId}-page-${page}.html`), html, "utf8");
  }

  const $ = cheerio.load(html);

  // Collapse whitespace runs to single spaces and trim the ends.
  const squash = (text) => text.replace(/\s+/g, " ").trim();

  // Materialize the rows first so each one can be awaited in turn;
  // cheerio's `.each` would not wait for an async callback.
  const rows = $(".row.data-row").toArray();

  const items = [];
  for (const row of rows) {
    const el = $(row);
    const left = el.find(".col-12.col-sm-10").first();
    const a = left.find("a.listdataconfig_link[href]").first();

    // Prefer the <label> text; fall back to the whole link text.
    const title = squash(a.find("label").text()) || squash(a.text());
    // Rows without a title are not announcements; skip before doing more work.
    if (!title) continue;

    const detailUrl = absUrl(a.attr("href"));
    const date = squash(el.find(".col-12.col-sm-2 #show-right-date").text());

    const icons = [];
    left.find("img").each((_, img) => {
      const src = $(img).attr("src");
      if (src) icons.push(absUrl(src));
    });

    // Visit the detail page for the actual file link.
    let file = { rawUrl: null, realUrl: null };
    if (detailUrl) {
      try {
        file = await scrapeDetailFile(detailUrl);
      } catch (err) {
        // Keep the item and carry on with the remaining rows.
        console.warn(`detail scrape failed for ${detailUrl}: ${err?.message ?? err}`);
      }
    }

    items.push({
      title,
      date: date || null,
      detailUrl: detailUrl || null,
      fileRawUrl: file.rawUrl,
      fileRealUrl: file.realUrl, // resolved download URL
      icons,
      sourcePage: page,
      sourceUrl: url,
    });
  }

  const output = {
    source: url,
    scrapedAt: new Date().toISOString(),
    menuId,
    page,
    count: items.length,
    items,
  };

  fs.writeFileSync(
    path.join(OUT, `list-menu-${menuId}-page-${page}.json`),
    JSON.stringify(output, null, 2),
    "utf8"
  );

  console.log(`✅ page ${page} -> items ${items.length}`);
  return { $, items };
}

/**
 * Entry point: scrape every listing page of menu 1238, de-duplicate the
 * items across pages, and write the merged result to
 * `list-menu-1238-all.json` in `OUT`.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const menuId = 1238;

  // Page 1 is scraped first (with the debug HTML dump) so the page count can
  // be read from its document.
  const first = await scrapeOnePage(menuId, 1, true);
  const totalPages = detectTotalPages(first.$);
  console.log("✅ totalPages =", totalPages);

  const all = [];
  const seen = new Set();

  // Append items, skipping any whose title/date/detailUrl combination was
  // already collected from an earlier page.
  const addItems = (items) => {
    for (const it of items) {
      const key = `${it.title || ""}|${it.date || ""}|${it.detailUrl || ""}`;
      if (seen.has(key)) continue;
      seen.add(key);
      all.push(it);
    }
  };

  addItems(first.items);

  // Remaining pages are fetched one after another.
  for (let p = 2; p <= totalPages; p++) {
    const { items } = await scrapeOnePage(menuId, p, false);
    addItems(items);
  }

  const merged = {
    menuId,
    totalPages,
    scrapedAt: new Date().toISOString(),
    totalItems: all.length,
    items: all,
  };

  const outAll = path.join(OUT, `list-menu-${menuId}-all.json`);
  fs.writeFileSync(outAll, JSON.stringify(merged, null, 2), "utf8");

  console.log("🎉 Saved merged JSON:", outAll);
  console.log("🎉 Total unique items:", all.length);
}

// Do not leave the promise floating: report the failure and exit non-zero.
main().catch((err) => {
  console.error("Scrape failed:", err);
  process.exitCode = 1;
});