diff --git a/bot/HeroBot.mjs b/bot/HeroBot.mjs
index 49a8ac0..2d58b31 100644
--- a/bot/HeroBot.mjs
+++ b/bot/HeroBot.mjs
@@ -7,7 +7,7 @@ import ClientLogPlugin from '../plugin/ClientLogPlugin.mjs';
import common from '../lib/common.mjs';
class HeroBot {
- constructor(heroCloudServer) {
+ constructor(heroCloudServer, botName) {
this.heroServer = heroCloudServer ? heroCloudServer : '';
this.supportedBots = {
@@ -15,9 +15,10 @@ class HeroBot {
kuaishou: 'https://www.kuaishou.com',
xigua: 'https://www.ixigua.com',
bilibili: 'https://www.bilibili.com',
+      webcrawler: 'for_all_web_sites', //fallback entry: matches any site not listed above
};
- this.name = '';
+    this.name = botName ? botName : '';
const __filename = fileURLToPath(import.meta.url);
this.root = path.dirname(__filename);
diff --git a/bot/WebCrawler.mjs b/bot/WebCrawler.mjs
new file mode 100644
index 0000000..78d784e
--- /dev/null
+++ b/bot/WebCrawler.mjs
@@ -0,0 +1,115 @@
+/**
+ * Crawler for generic websites
+ * Scraping rules:
+ * 1. Parse the <title> tag for the page title
+ * 2. Parse the share-image meta tags: take the content attribute of any <meta> whose property contains og:image. Reference: https://ogp.me/
+ * 3. If the steps above find no image, parse all <img> tags on the page: default to the first image, and prefer a fully loaded one whose natural width is at least configs.minImageNaturalWidth
+ */
+import Hero from '@ulixee/hero';
+import HeroBot from './HeroBot.mjs';
+import ClientLogPlugin from '../plugin/ClientLogPlugin.mjs';
+import common from '../lib/common.mjs';
+import getConfigs from '../config.mjs'; //configs is referenced throughout scrap() and must be imported here
+
+const configs = await getConfigs(); //assumed: config.mjs default-exports a config loader, as used in test/scrap_test.mjs; await is harmless if it returns a plain object
+
+class WebCrawler extends HeroBot {
+
+ async scrap(url) {
+ let data = {url: url, done: false};
+
+    //use an iPhone user agent and mobile viewport when mobile mode is requested
+ if (this.ua == 'mob') {
+ configs.userAgent = configs.userAgents.iphone_safari;
+ configs.viewport = configs.viewports.mob;
+ }
+
+ let options = {
+      userAgent: configs.userAgent, //defaults to a macOS desktop UA
+ viewport: configs.viewport,
+ };
+
+ options = common.mergeConfigs(configs.botOptions, options);
+
+ if (this.heroServer) {
+ options.connectionToCore = this.heroServer;
+ }
+
+ const hero = new Hero(options);
+
+ try {
+      hero.use(ClientLogPlugin); //enable client-side logging
+ await hero.goto(url, configs.heroBotOptions);
+
+      //wait for the DOM to finish loading
+ const tab = await hero.activeTab;
+ await tab.waitForLoad('DomContentLoaded', {timeoutMs: configs.heroTabOptions.timeoutMs});
+
+      //parse data from the page HTML
+ data.title = await hero.document.title;
+
+      //cover image extraction
+
+      //1. parse <meta> tags
+ const elems = await hero.document.querySelectorAll('meta');
+ let meta_name = '';
+ for (const elem of elems) {
+ meta_name = await elem.getAttribute('property');
+ if (!meta_name) {continue;}
+ meta_name = meta_name.toLowerCase();
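+        //check og:image before the generic 'image' match: a property containing 'og:image' would also satisfy the looser indexOf('image') test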
+ if (!data.cover && meta_name.indexOf('og:image') > -1) {
+ data.cover = await elem.getAttribute('content');
+ }else if (!data.cover && meta_name.indexOf('image') > -1) {
+ let contentStr = await elem.getAttribute('content');
+ if (common.isUrl(contentStr)) {
+ data.cover = contentStr;
+ }
+ }else if (meta_name.indexOf('og:title') > -1) {
+ data.title = await elem.getAttribute('content');
+ }
+ }
+
+      //2. parse <img> tags
+ if (!data.cover) {
+ let minNaturalWidth = configs.minImageNaturalWidth ? configs.minImageNaturalWidth : 50;
+        const imgElems = await hero.querySelectorAll('img');
+        if (imgElems && imgElems.length > 0) { //guard against pages without any <img>
+          data.cover = await imgElems[0].src; //default to the first image
+
+ for (const imgEl of imgElems) {
+ //console.log('Completed: %s, naturalWidth: %s, width: %s', await imgEl.complete, await imgEl.naturalWidth, await imgEl.width);
+ if (await imgEl.complete && await imgEl.naturalWidth >= minNaturalWidth) {
+ data.cover = await imgEl.src;
+ //console.log('Cover got: %s', data.cover);
+ break;
+ }
+ }
+ }
+ }
+
+ //get cover image's base64 data
+      if (data.cover) {
+ data.cover = common.getAbsoluteUrl(data.cover);
+
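+        //reuse the same hero session to fetch the image; hero.goto resolves to a resource whose raw bytes are read via response.buffer below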
+ const response = await hero.goto(data.cover);
+ const imgBuffer = await response.buffer;
+ //console.log('Cover image fetch done', imgBuffer.toString('base64'));
+ if (imgBuffer) {
+ data.cover_base64 = imgBuffer.toString('base64');
+ data.cover_type = common.getImageType(data.cover);
+ }
+ }
+
+ await hero.close();
+
+ data.bot = this.name;
+ data.done = true;
+ }catch(error) {
+      console.error("Error when requesting %s via hero: %s", url, error);
+ await hero.close();
+ }
+
+ return data;
+ }
+
+}
+
+export default WebCrawler;
diff --git a/config.mjs b/config.mjs
index 9e5cdf0..b67fbe5 100644
--- a/config.mjs
+++ b/config.mjs
@@ -34,6 +34,8 @@ let configs = {
height: 900
},
+  minImageNaturalWidth: 50, //minimum natural width (px) an <img> must have to be chosen as the cover image
+
//the available options are documented at: https://ulixee.org/docs/hero/basic-client/hero
botOptions: {
showChrome: false,
diff --git a/lib/common.mjs b/lib/common.mjs
index e3169a7..e2284e5 100644
--- a/lib/common.mjs
+++ b/lib/common.mjs
@@ -27,7 +27,7 @@ export default {
},
getBotName: function(url) {
- let botName = '';
+    let botName = 'website'; //unrecognized domains fall through to the generic website crawler
if (/douyin\.com/ig.test(url)) {
botName = 'douyin';
@@ -53,7 +53,7 @@ export default {
getImageType: function(url) {
let imgType = 'jpeg';
- if (/\.jpe?g/ig.test(url)) {
+    if (/\.jpe?g/i.test(url)) {
imgType = 'jpeg';
}else if (/\.png/ig.test(url)) {
imgType = 'png';
@@ -66,6 +66,10 @@ export default {
return imgType;
},
+  isUrl: function(url) {
+    return /^http(s)?:\/\/.+/i.test(url); //no 'g' flag: test() on a global regex keeps lastIndex state between calls
+  },
+
loadCustomizeConfig: async function(configFileName) {
let configs = {};
diff --git a/spider.mjs b/spider.mjs
index 6d61c41..a7f3f15 100644
--- a/spider.mjs
+++ b/spider.mjs
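@@ -4,6 +4,7 @@
 import Bilibili from './bot/Bilibili.mjs';
+import WebCrawler from './bot/WebCrawler.mjs'; //required by the new default case below; hunk position assumed to follow the existing bot imports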
@@ -81,6 +81,9 @@ import path from 'node:path';
case 'bilibili':
bot = new Bilibili(heroCloudServer);
break;
+ default:
+        bot = new WebCrawler(heroCloudServer, 'webcrawler');
+ break;
}
if (bot) {
diff --git a/test/scrap_test.mjs b/test/scrap_test.mjs
index 531958d..bf2b212 100644
--- a/test/scrap_test.mjs
+++ b/test/scrap_test.mjs
@@ -2,6 +2,7 @@ import Douyin from '../bot/Douyin.mjs';
import Kuaishou from '../bot/Kuaishou.mjs';
import Xigua from '../bot/Xigua.mjs';
import Bilibili from '../bot/Bilibili.mjs';
+import WebCrawler from '../bot/WebCrawler.mjs';
import getConfigs from '../config.mjs';
(async () => {
@@ -103,6 +104,25 @@ import getConfigs from '../config.mjs';
break;
+ default:
+        //generic websites: the last assignment wins, swap the comments to test another URL
+        //url = 'https://www.baidu.com';
+        //url = 'https://www.zhihu.com';
+        url = 'https://ogp.me/';
+
+        configs.heroTabOptions.timeoutMs = 20000; //timeout for waiting for the page content to finish loading
+
+ configs.userAgent = configs.userAgents.mac_chrome;
+ configs.viewport = configs.viewports.pc;
+
+        console.log('Hero configs', configs);
+        const crawler = new WebCrawler(heroCloudServer, 'webcrawler');
+        console.log('Requesting: %s ...', url);
+        data = await crawler.scrap(url);
+        console.log("Scrape result:\n%s", JSON.stringify(data));
+
+ break;
+
}
process.exit(0);