migrate-lsw/award_of_pride.js

222 lines
5.7 KiB
JavaScript

// Site breadcrumb of the scraped section: หน้าแรก > งานบริการ > รางวัลแห่งความภูมิใจ (Home > Services > Awards of Pride)
const { execFileSync, execSync } = require("child_process");
const cheerio = require("cheerio");
const fs = require("fs");
const path = require("path");
// Origin of the site being scraped; site-relative paths are joined onto it.
const BASE = "https://ladsawai.go.th";
// Output directory, located under the current working directory.
const OUT = path.join(process.cwd(), "รางวัลแห่งความภูมิใจ");
// Created at module load time so later writeFileSync calls can assume it exists.
fs.mkdirSync(OUT, { recursive: true });
/**
 * Download a URL with the system `curl` binary and return the response body.
 *
 * Redirects are followed (-L), progress output is silenced (-s), and a
 * browser-like User-Agent plus a Thai Accept-Language header are sent.
 *
 * curl is spawned directly with an argument array instead of through a shell
 * command string: the URLs passed in are partly built from scraped `href`
 * values, and quoting characters or `$(...)` inside them must never be
 * interpreted as shell syntax. Passing the address via `--url` also keeps a
 * value starting with `-` from being parsed as a curl option.
 *
 * @param {string} url - Address to fetch.
 * @returns {string} Response body decoded as UTF-8.
 * @throws {Error} If curl cannot be spawned, exits with a non-zero status,
 *   or produces more than 20 MiB of output.
 */
function curlHtml(url) {
  const args = [
    "-L",
    "-s",
    "--url",
    url,
    "-H",
    "User-Agent: Mozilla/5.0",
    "-H",
    "Accept-Language: th-TH,th;q=0.9",
  ];
  return execFileSync("curl", args, {
    encoding: "utf8",
    maxBuffer: 20 * 1024 * 1024,
  });
}
/**
 * Turn a scraped `src`/`href` value into an absolute http(s) URL.
 *
 * - Empty / missing values yield `null`.
 * - Values that already start with `http://` or `https://` are returned
 *   unchanged.
 * - Everything else is resolved against BASE with the WHATWG URL parser, so
 *   root-relative paths (`/a.jpg`), paths without a leading slash (`a.jpg`)
 *   and protocol-relative references (`//host/a.jpg`) all produce a valid
 *   URL. The resolved form is normalized by the parser (e.g. non-ASCII path
 *   characters are percent-encoded).
 * - References that do not resolve to http(s) (`javascript:`, `mailto:`,
 *   `data:` ...) or cannot be parsed yield `null`.
 *
 * @param {string|null|undefined} src - Raw attribute value.
 * @returns {string|null} Absolute URL, or `null` when none can be produced.
 */
function absUrl(src) {
  if (!src) return null;
  if (/^https?:\/\//i.test(src)) return src;
  try {
    const resolved = new URL(src, BASE);
    const isWeb = resolved.protocol === "http:" || resolved.protocol === "https:";
    return isWeb ? resolved.href : null;
  } catch (_err) {
    // Unparseable reference: treat it the same as a missing one.
    return null;
  }
}
/**
 * Fetch a detail page and pull out its gallery image links, its text content
 * and its main (topic) image.
 *
 * @param {string} detailUrl - Absolute URL of the detail page.
 * @returns {{imgs: string[], text: string, mainImage: string}}
 *   `imgs` holds de-duplicated absolute link targets, `text` the extracted
 *   content ("" when no content box is found) and `mainImage` the absolute
 *   URL of the first `.imagestopic img[src]` ("" when there is none).
 */
function scrapeDetailImagesContent(detailUrl) {
  const $ = cheerio.load(curlHtml(detailUrl));

  // Text of a node after dropping descendants matching `dropSelector`
  // (emoji images, scripts, styles), with whitespace runs collapsed.
  const cleanText = ($node, dropSelector) =>
    $node
      .clone()
      .find(dropSelector)
      .remove()
      .end()
      .text()
      .replace(/\s+/g, " ")
      .trim();

  // ---------- images ----------
  const imageUrls = new Set();
  const addLinks = (selector, keep) => {
    $(selector).each((_, anchor) => {
      const resolved = absUrl(($(anchor).attr("href") || "").trim());
      if (resolved && keep(resolved)) imageUrls.add(resolved);
    });
  };

  // Preferred source: every link inside the gallery group.
  addLinks(".maingroup.gallery a[href]", () => true);
  // Fallback: any link on the page whose target looks like an image file.
  if (imageUrls.size === 0) {
    addLinks("a[href]", (target) => /\.(jpg|jpeg|png|webp|gif)(\?|$)/i.test(target));
  }

  // ---------- content ----------
  // Pick the non-gallery box that carries the most real text; each <p>
  // adds a bonus of 50 so paragraph-structured boxes win over stray text.
  let bestBox = null;
  let bestScore = -1;
  for (const node of $(".col-12.maingroup").not(".gallery").toArray()) {
    const $box = $(node);
    const score =
      cleanText($box, "img, script, style").length + $box.find("p").length * 50;
    if (score > bestScore) {
      bestScore = score;
      bestBox = $box;
    }
  }

  let content = "";
  if (bestBox !== null && bestBox.length) {
    // One output line per non-empty paragraph (emoji images removed).
    const paragraphs = bestBox
      .find("p")
      .toArray()
      .map((p) => cleanText($(p), "img"))
      .filter((line) => line !== "");
    content =
      paragraphs.length > 0
        ? paragraphs.join("\n")
        : cleanText(bestBox, "img, script, style"); // no <p>: use the whole box
  }

  // ---------- main image ----------
  let mainImageUrl = "";
  try {
    const rawSrc = ($(".imagestopic img[src]").attr("src") || "").trim();
    const resolved = absUrl(rawSrc);
    if (resolved) mainImageUrl = resolved;
  } catch (error) {
    mainImageUrl = "";
  }

  return { imgs: [...imageUrls], text: content, mainImage: mainImageUrl };
}
/**
 * Scrape one listing page of a menu, follow every row's detail link, and
 * write the result to `list-menu-<menuId>-page-<page>.json` inside OUT.
 *
 * Rows without a title are skipped. A row without a detail link is still
 * emitted, with `detailRef: null` and an empty `detail` object, instead of
 * aborting the whole run.
 *
 * @param {number|string} menuId - Menu identifier used in the listing URL.
 * @param {number} page - 1-based page number of the listing.
 * @param {boolean} [saveHtml=false] - Also store the raw listing HTML in OUT.
 * @returns {Array<object>} The items found on this page.
 * @throws {Error} Propagates failures from fetching pages or writing files.
 */
function scrapeOnePage(menuId, page, saveHtml = false) {
  // Listing URL shape: <BASE>/public/list/data/index/menu/1559/page/1
  const url = `${BASE}/public/list/data/index/menu/${menuId}/page/${page}`;
  const html = curlHtml(url);
  if (saveHtml) {
    fs.writeFileSync(path.join(OUT, `page-menu-${menuId}-page-${page}.html`), html, "utf8");
  }

  const $ = cheerio.load(html);
  const items = [];

  $(".row.data-row").each((_, row) => {
    const el = $(row);
    const title = el
      .find(".col-12.col-sm-10")
      .text()
      .replace(/\s+/g, " ")
      .trim();
    // Skip before touching anything else, so title-less rows never trigger a fetch.
    if (!title) return;

    // attr() returns undefined when the row has no link; default to "" so
    // trim() cannot throw.
    const detailRef = (el.find("a.listdataconfig_link").attr("href") || "").trim();
    const date = el.find(".col-sm-2").last().text().trim();
    const imgSrc = el.find("img").attr("src");

    // Detail href shape: /public/list/data/detail/id/3826/menu/1559/page/1
    const linkD = absUrl(detailRef);
    const { text, imgs, mainImage } = linkD
      ? scrapeDetailImagesContent(linkD)
      : { text: "", imgs: [], mainImage: "" };

    items.push({
      title,
      detailRef: linkD,
      detail: {
        img: imgs,
        content: text,
        mainImage: mainImage,
      },
      date: date || null,
      image: absUrl(imgSrc),
      sourcePage: page,
      sourceUrl: url,
    });
  });

  const output = {
    source: url,
    scrapedAt: new Date().toISOString(),
    menuId,
    page,
    count: items.length,
    items,
  };
  const outJson = path.join(OUT, `list-menu-${menuId}-page-${page}.json`);
  fs.writeFileSync(outJson, JSON.stringify(output, null, 2), "utf8");
  console.log(`✅ page ${page} -> items ${items.length}`);
  return items;
}
// Entry point: scrape every listing page of the configured menu, drop
// duplicate items and write one merged JSON file into OUT.
(function main() {
  const menuId = 1402; // activities
  const totalPages = 1;
  // Set to false if you do not want one raw HTML file per page (e.g. 53 files).
  const saveHtml = true;

  // Merge + de-duplicate: the first item seen for a title/date/image
  // combination wins; Map keeps insertion order.
  const uniqueItems = new Map();
  for (let page = 1; page <= totalPages; page += 1) {
    const pageItems = scrapeOnePage(menuId, page, saveHtml);
    pageItems.forEach((item) => {
      const key = `${item.title}|${item.date || ""}|${item.image || ""}`;
      if (!uniqueItems.has(key)) {
        uniqueItems.set(key, item);
      }
    });
  }
  const all = Array.from(uniqueItems.values());

  const merged = {
    menuId,
    totalPages,
    scrapedAt: new Date().toISOString(),
    totalItems: all.length,
    items: all,
  };
  const outAll = path.join(OUT, `list-menu-${menuId}-all.json`);
  const serialized = JSON.stringify(merged, null, 2);
  fs.writeFileSync(outAll, serialized, "utf8");

  console.log("✅ Saved merged JSON:", outAll);
  console.log("✅ Total unique items:", all.length);
})();