From 14219c95d1d56df850866094d215ab3fe4763df4 Mon Sep 17 00:00:00 2001 From: filesite Date: Thu, 16 May 2024 07:38:32 +0800 Subject: [PATCH] add WebCrawler for all websites --- bot/HeroBot.mjs | 5 +- bot/WebCrawler.mjs | 115 ++++++++++++++++++++++++++++++++++++++++++++ config.mjs | 2 + lib/common.mjs | 8 ++- spider.mjs | 3 ++ test/scrap_test.mjs | 20 ++++++++ 6 files changed, 149 insertions(+), 4 deletions(-) create mode 100644 bot/WebCrawler.mjs diff --git a/bot/HeroBot.mjs b/bot/HeroBot.mjs index 49a8ac0..2d58b31 100644 --- a/bot/HeroBot.mjs +++ b/bot/HeroBot.mjs @@ -7,7 +7,7 @@ import ClientLogPlugin from '../plugin/ClientLogPlugin.mjs'; import common from '../lib/common.mjs'; class HeroBot { - constructor(heroCloudServer) { + constructor(heroCloudServer, botName) { this.heroServer = heroCloudServer ? heroCloudServer : ''; this.supportedBots = { @@ -15,9 +15,10 @@ class HeroBot { kuaishou: 'https://www.kuaishou.com', xigua: 'https://www.ixigua.com', bilibili: 'https://www.bilibili.com', + webcrawler: 'for_all_web_sites', }; - this.name = ''; + this.name = typeof(botName) != 'undefined' && botName ? botName : ''; const __filename = fileURLToPath(import.meta.url); this.root = path.dirname(__filename); diff --git a/bot/WebCrawler.mjs b/bot/WebCrawler.mjs new file mode 100644 index 0000000..78d784e --- /dev/null +++ b/bot/WebCrawler.mjs @@ -0,0 +1,115 @@ +/** + * 普通网站用爬虫 + * 抓取规则: + * 1. 解析标签 + * 2. 解析标签里的分享图片属性,任何property里包含og:image的标签属性content,参考:https://ogp.me/ + * 3. 
If none of the steps above yields an image, parse every <img> tag on the page: default to the first image, but prefer the first fully loaded one whose natural width >= configs.minImageNaturalWidth (falls back to 50) + */ +import Hero from '@ulixee/hero'; +import HeroBot from './HeroBot.mjs'; +import ClientLogPlugin from '../plugin/ClientLogPlugin.mjs'; +import common from '../lib/common.mjs'; + +class WebCrawler extends HeroBot { + + async scrap(url) { + let data = {url: url, done: false}; + + //use iphone + if (this.ua == 'mob') { + configs.userAgent = configs.userAgents.iphone_safari; + configs.viewport = configs.viewports.mob; + } + + let options = { + userAgent: configs.userAgent, //default mac os + viewport: configs.viewport, + }; + + options = common.mergeConfigs(configs.botOptions, options); + + if (this.heroServer) { + options.connectionToCore = this.heroServer; + } + + const hero = new Hero(options); + + try { + hero.use(ClientLogPlugin); //enable client log + await hero.goto(url, configs.heroBotOptions); + + //wait until the active tab reaches DomContentLoaded + const tab = await hero.activeTab; + await tab.waitForLoad('DomContentLoaded', {timeoutMs: configs.heroTabOptions.timeoutMs}); + + //read the page title from the document + data.title = await hero.document.title; + + //cover image extraction + + //1. parse <meta> tags: og:image / *image* give the cover, og:title overrides the title + const elems = await hero.document.querySelectorAll('meta'); + let meta_name = ''; + for (const elem of elems) { + meta_name = await elem.getAttribute('property'); + if (!meta_name) {continue;} + meta_name = meta_name.toLowerCase(); + if (!data.cover && meta_name.indexOf('og:image') > -1) { + data.cover = await elem.getAttribute('content'); + }else if (!data.cover && meta_name.indexOf('image') > -1) { + let contentStr = await elem.getAttribute('content'); + if (common.isUrl(contentStr)) { + data.cover = contentStr; + } + }else if (meta_name.indexOf('og:title') > -1) { + data.title = await elem.getAttribute('content'); + } + } + + //2. parse <img> tags, only when no meta tag provided a cover + if (!data.cover) { + let minNaturalWidth = configs.minImageNaturalWidth ? 
configs.minImageNaturalWidth : 50; + const imgElems = await hero.querySelectorAll('img'); + if (imgElems && await imgElems.length > 0) { + data.cover = await imgElems[0].src; + + for (const imgEl of imgElems) { + //console.log('Completed: %s, naturalWidth: %s, width: %s', await imgEl.complete, await imgEl.naturalWidth, await imgEl.width); + if (await imgEl.complete && await imgEl.naturalWidth >= minNaturalWidth) { + data.cover = await imgEl.src; + //console.log('Cover got: %s', data.cover); + break; + } + } + } + } + + + //get cover image's base64 data + if (typeof(data.cover) != 'undefined' && data.cover) { + data.cover = common.getAbsoluteUrl(data.cover); + + const response = await hero.goto(data.cover); + const imgBuffer = await response.buffer; + //console.log('Cover image fetch done', imgBuffer.toString('base64')); + if (imgBuffer) { + data.cover_base64 = imgBuffer.toString('base64'); + data.cover_type = common.getImageType(data.cover); + } + } + + await hero.close(); + + data.bot = this.name; + data.done = true; + }catch(error) { + console.error("Error got when request %s via hero: %s", url, error); + await hero.close(); + } + + return data; + } + +} + +export default WebCrawler; diff --git a/config.mjs b/config.mjs index 9e5cdf0..b67fbe5 100644 --- a/config.mjs +++ b/config.mjs @@ -34,6 +34,8 @@ let configs = { height: 900 }, + minImageNaturalWidth: 50, //minimum natural width for an <img> to be preferred as the cover image + //可选项参考官方文档:https://ulixee.org/docs/hero/basic-client/hero botOptions: { showChrome: false, diff --git a/lib/common.mjs b/lib/common.mjs index e3169a7..e2284e5 100644 --- a/lib/common.mjs +++ b/lib/common.mjs @@ -27,7 +27,7 @@ export default { }, getBotName: function(url) { - let botName = ''; + let botName = 'website'; if (/douyin\.com/ig.test(url)) { botName = 'douyin'; @@ -53,7 +53,7 @@ export default { getImageType: function(url) { let imgType = 'jpeg'; - if (/\.jpe?g/ig.test(url)) { + if (/\.jp(e)?g/ig.test(url)) { imgType = 'jpeg'; }else if (/\.png/ig.test(url)) { imgType = 'png'; @@ -66,6 +66,10 @@ export 
default { return imgType; }, + isUrl: function(url) { + return /^http(s)?:\/\/.+/ig.test(url); + }, + loadCustomizeConfig: async function(configFileName) { let configs = {}; diff --git a/spider.mjs b/spider.mjs index 6d61c41..a7f3f15 100644 --- a/spider.mjs +++ b/spider.mjs @@ -81,6 +81,9 @@ import path from 'node:path'; case 'bilibili': bot = new Bilibili(heroCloudServer); break; + default: + bot = new WebCrawler(heroCloudServer, 'webcrawler'); + break; } if (bot) { diff --git a/test/scrap_test.mjs b/test/scrap_test.mjs index 531958d..bf2b212 100644 --- a/test/scrap_test.mjs +++ b/test/scrap_test.mjs @@ -2,6 +2,7 @@ import Douyin from '../bot/Douyin.mjs'; import Kuaishou from '../bot/Kuaishou.mjs'; import Xigua from '../bot/Xigua.mjs'; import Bilibili from '../bot/Bilibili.mjs'; +import WebCrawler from '../bot/WebCrawler.mjs'; import getConfigs from '../config.mjs'; (async () => { @@ -103,6 +104,25 @@ import getConfigs from '../config.mjs'; break; + default: + //generic website; the last url assignment below is the one actually requested + url = 'https://www.baidu.com'; + url = 'https://www.zhihu.com'; + url = 'https://ogp.me/'; + + configs.heroTabOptions.timeoutMs = 20000; //tab load timeout in ms, passed to waitForLoad by WebCrawler + + configs.userAgent = configs.userAgents.mac_chrome; + configs.viewport = configs.viewports.pc; + + console.log('Hero配置', configs); + const crawler = new WebCrawler(heroCloudServer, 'webcrawler'); + console.log('请求中: %s ...', url); + data = await crawler.scrap(url); + console.log("解析结果:\n%s", JSON.stringify(data)); + + break; + } process.exit(0);