add WebCrawler for all websites

1 year ago · 14219c95d1
6 changed files with 149 additions and 4 deletions
--- a/bot/HeroBot.mjs
+++ b/bot/HeroBot.mjs
@ -7,7 +7,7 @@ import ClientLogPlugin from '../plugin/ClientLogPlugin.mjs';
				@@ -7,7 +7,7 @@ import ClientLogPlugin from '../plugin/ClientLogPlugin.mjs';
 import common from '../lib/common.mjs';

 class HeroBot {
-    constructor(heroCloudServer) {
+    constructor(heroCloudServer, botName) {
        this.heroServer = heroCloudServer ? heroCloudServer : '';

        this.supportedBots = {
@ -15,9 +15,10 @@ class HeroBot {
				@@ -15,9 +15,10 @@ class HeroBot {
            kuaishou: 'https://www.kuaishou.com',
            xigua: 'https://www.ixigua.com',
            bilibili: 'https://www.bilibili.com',
+            webcrawler: 'for_all_web_sites',
        };

-        this.name = '';
+        this.name = typeof(botName) != 'undefined' && botName ? botName : '';

        const __filename = fileURLToPath(import.meta.url);
        this.root = path.dirname(__filename);
--- a/bot/WebCrawler.mjs
+++ b/bot/WebCrawler.mjs
@ -0,0 +1,115 @@
				@@ -0,0 +1,115 @@
+/**
+ * 普通网站用爬虫
+ * 抓取规则：
+ * 1. 解析<title></title>标签
+ * 2. 解析<meta>标签里的分享图片属性，任何property里包含og:image的标签属性content，参考：https://ogp.me/
+ * 3. 如果上述步骤都没有解析到图片，则从<body></body>中解析所有的<img>标签，默认抓取第一张图片，并优先抓取已加载完成且原始宽度>=configs.minImageNaturalWidth（默认50px）的图片
+ */
+import Hero from '@ulixee/hero';
+import HeroBot from './HeroBot.mjs';
+import ClientLogPlugin from '../plugin/ClientLogPlugin.mjs';
+import common from '../lib/common.mjs';
+
+class WebCrawler extends HeroBot {
+
+    async scrap(url) {
+        let data = {url: url, done: false};
+
+        //use iphone
+        if (this.ua == 'mob') {
+            configs.userAgent = configs.userAgents.iphone_safari;
+            configs.viewport = configs.viewports.mob;
+        }
+
+        let options = {
+            userAgent: configs.userAgent,     //default mac os
+            viewport: configs.viewport,
+        };
+
+        options = common.mergeConfigs(configs.botOptions, options);
+
+        if (this.heroServer) {
+            options.connectionToCore = this.heroServer;
+        }
+
+        const hero = new Hero(options);
+
+        try {
+            hero.use(ClientLogPlugin);          //开启log
+            await hero.goto(url, configs.heroBotOptions);
+
+            //等待所有内容加载完成
+            const tab = await hero.activeTab;
+            await tab.waitForLoad('DomContentLoaded', {timeoutMs: configs.heroTabOptions.timeoutMs});
+
+            //解析网页HTML数据
+            data.title = await hero.document.title;
+
+            //封面图抓取
+
+            //1. 解析meta
+            const elems = await hero.document.querySelectorAll('meta');
+            let meta_name = '';
+            for (const elem of elems) {
+                meta_name = await elem.getAttribute('property');
+                if (!meta_name) {continue;}
+                meta_name = meta_name.toLowerCase();
+                if (!data.cover && meta_name.indexOf('og:image') > -1) {
+                    data.cover = await elem.getAttribute('content');
+                }else if (!data.cover && meta_name.indexOf('image') > -1) {
+                    let contentStr = await elem.getAttribute('content');
+                    if (common.isUrl(contentStr)) {
+                        data.cover = contentStr;
+                    }
+                }else if (meta_name.indexOf('og:title') > -1) {
+                    data.title = await elem.getAttribute('content');
+                }
+            }
+
+            //2. <img>解析
+            if (!data.cover) {
+                let minNaturalWidth = configs.minImageNaturalWidth ? configs.minImageNaturalWidth : 50;
+                const imgElems = await hero.querySelectorAll('img');
+                if (imgElems) {
+                    data.cover = await imgElems[0].src;
+
+                    for (const imgEl of imgElems) {
+                        //console.log('Completed: %s, naturalWidth: %s, width: %s', await imgEl.complete, await imgEl.naturalWidth, await imgEl.width);
+                        if (await imgEl.complete && await imgEl.naturalWidth >= minNaturalWidth) {
+                            data.cover = await imgEl.src;
+                            //console.log('Cover got: %s', data.cover);
+                            break;
+                        }
+                    }
+                }
+            }
+
+
+            //get cover image's base64 data
+            if (typeof(data.cover) != 'undefined' && data.cover) {
+                data.cover = common.getAbsoluteUrl(data.cover);
+
+                const response = await hero.goto(data.cover);
+                const imgBuffer = await response.buffer;
+                //console.log('Cover image fetch done', imgBuffer.toString('base64'));
+                if (imgBuffer) {
+                    data.cover_base64 = imgBuffer.toString('base64');
+                    data.cover_type = common.getImageType(data.cover);
+                }
+            }
+
+            await hero.close();
+
+            data.bot = this.name;
+            data.done = true;
+        }catch(error) {
+            console.error("Error got when request %s via hero: %s", url, error);
+            await hero.close();
+        }
+
+        return data;
+    }
+
+}
+
+export default WebCrawler;
--- a/config.mjs
+++ b/config.mjs
@ -34,6 +34,8 @@ let configs = {
				@@ -34,6 +34,8 @@ let configs = {
        height: 900
    },

+    minImageNaturalWidth: 50,      //从<img>标签抓取封面图时的图片原始尺寸最小宽度
+
    //可选项参考官方文档：https://ulixee.org/docs/hero/basic-client/hero
    botOptions: {
        showChrome: false,
--- a/lib/common.mjs
+++ b/lib/common.mjs
@ -27,7 +27,7 @@ export default {
				@@ -27,7 +27,7 @@ export default {
    },

    getBotName: function(url) {
-        let botName = '';
+        let botName = 'website';

        if (/douyin\.com/ig.test(url)) {
            botName = 'douyin';
@ -53,7 +53,7 @@ export default {
				@@ -53,7 +53,7 @@ export default {
    getImageType: function(url) {
        let imgType = 'jpeg';

-        if (/\.jpe?g/ig.test(url)) {
+        if (/\.jp(e)?g/ig.test(url)) {
            imgType = 'jpeg';
        }else if (/\.png/ig.test(url)) {
            imgType = 'png';
@ -66,6 +66,10 @@ export default {
				@@ -66,6 +66,10 @@ export default {
        return imgType;
    },

+    isUrl: function(url) {
+        return /^http(s)?:\/\/.+/ig.test(url);
+    },
+
    loadCustomizeConfig: async function(configFileName) {
        let configs = {};

--- a/spider.mjs
+++ b/spider.mjs
@ -81,6 +81,9 @@ import path from 'node:path';
				@@ -81,6 +81,9 @@ import path from 'node:path';
            case 'bilibili':
                bot = new Bilibili(heroCloudServer);
                break;
+            default:
+                bot = = new WebCrawler(heroCloudServer, 'webcrawler');
+                break;
        }

        if (bot) {
--- a/test/scrap_test.mjs
+++ b/test/scrap_test.mjs
@ -2,6 +2,7 @@ import Douyin from '../bot/Douyin.mjs';
				@@ -2,6 +2,7 @@ import Douyin from '../bot/Douyin.mjs';
 import Kuaishou from '../bot/Kuaishou.mjs';
 import Xigua from '../bot/Xigua.mjs';
 import Bilibili from '../bot/Bilibili.mjs';
+import WebCrawler from '../bot/WebCrawler.mjs';
 import getConfigs from '../config.mjs';

 (async () => {
@ -103,6 +104,25 @@ import getConfigs from '../config.mjs';
				@@ -103,6 +104,25 @@ import getConfigs from '../config.mjs';

            break;

+        default:
+            //generic website
+            url = 'https://www.baidu.com';      //overwritten by the next assignment
+            url = 'https://www.zhihu.com';      //overwritten by the next assignment
+            url = 'https://ogp.me/';            //the URL actually requested below
+
+            configs.heroTabOptions.timeoutMs = 20000;   //timeout for waiting until all content has loaded
+
+            configs.userAgent = configs.userAgents.mac_chrome;
+            configs.viewport = configs.viewports.pc;
+
+            console.log('Hero配置', configs);
+            const crawler = new WebCrawler(heroCloudServer, 'webcrawler');
+            console.log('请求中: %s ...', url);
+            data = await crawler.scrap(url);
+            console.log("解析结果:\n%s", JSON.stringify(data));
+
+            break;
+
    }

    process.exit(0);