/**
 * Crawler for ordinary (general-purpose) web sites.
 * Scraping rules:
 * 1. Parse the <title> tag.
 * 2. Parse the share-image attributes of <meta> tags: for any tag whose `property`
 *    contains og:image, take its `content` attribute. Reference: https://ogp.me/
 * 3. If the steps above yield no image, parse all <img> tags on the page, default to
 *    the first image, and prefer a fully loaded image whose width is >= 300px
 *    (the actual threshold is configs.minImageNaturalWidth, which falls back to 50 below).
 */
import Hero from '@ulixee/hero';
import HeroBot from './HeroBot.mjs';
import ClientLogPlugin from '../plugin/ClientLogPlugin.mjs';
import common from '../lib/common.mjs';
// NOTE: `configs` was referenced but never imported in the original file;
// the path below is an assumption and should point at the shared configs module.
import configs from '../lib/configs.mjs';

class WebCrawler extends HeroBot {

  async scrap(url) {
    let data = {url: url, done: false};

    // Use an iPhone profile when the bot is configured for mobile
    if (this.ua == 'mob') {
      configs.userAgent = configs.userAgents.iphone_safari;
      configs.viewport = configs.viewports.mob;
    }

    let options = {
      userAgent: configs.userAgent,   // defaults to macOS
      viewport: configs.viewport,
    };
    options = common.mergeConfigs(configs.botOptions, options);
    if (this.heroServer) {
      options.connectionToCore = this.heroServer;
    }

    const hero = new Hero(options);

    try {
      hero.use(ClientLogPlugin);      // enable client-side logging

      await hero.goto(url, configs.heroBotOptions);

      // Wait for the page DOM to finish loading
      const tab = await hero.activeTab;
      await tab.waitForLoad('DomContentLoaded', {timeoutMs: configs.heroTabOptions.timeoutMs});

      // Random pause before parsing
      let rnd_seconds = 10 + Math.floor(Math.random() * 10);
      console.log("Sleep %s seconds...", rnd_seconds);
      await common.delay(rnd_seconds);

      // Parse the page HTML
      data.title = await hero.document.title;

      // Cover image extraction
      // 1. Parse <meta> tags
      const elems = await hero.document.querySelectorAll('meta');
      let meta_name = '';
      for (const elem of elems) {
        meta_name = await elem.getAttribute('property');
        if (!meta_name) {continue;}

        meta_name = meta_name.toLowerCase();
        if (!data.cover && meta_name.indexOf('og:image') > -1) {
          data.cover = await elem.getAttribute('content');
        } else if (!data.cover && meta_name.indexOf('image') > -1) {
          let contentStr = await elem.getAttribute('content');
          if (common.isUrl(contentStr)) {
            data.cover = contentStr;
          }
        } else if (meta_name.indexOf('og:title') > -1) {
          data.title = await elem.getAttribute('content');
        }
      }

      // 2. Parse <img> tags
      if (!data.cover) {
        let minNaturalWidth = configs.minImageNaturalWidth ? configs.minImageNaturalWidth : 50;

        const imgElems = await hero.querySelectorAll('img');
        if (imgElems && await imgElems.length > 0) {
          // Fall back to the first image, then prefer a loaded one that is wide enough
          data.cover = await imgElems[0].src;
          for (const imgEl of imgElems) {
            if (await imgEl.complete && await imgEl.naturalWidth >= minNaturalWidth) {
              data.cover = await imgEl.src;
              break;
            }
          }
        }
      }

      // Fetch the cover image and keep its base64 data
      if (data.cover) {
        data.cover = common.getAbsoluteUrl(data.cover);

        const response = await hero.goto(data.cover);
        const imgBuffer = await response.buffer;
        if (imgBuffer) {
          data.cover_base64 = imgBuffer.toString('base64');
          data.cover_type = common.getImageType(data.cover);
        }
      }

      data.bot = this.name;
      data.done = true;
    } catch (error) {
      console.error("Error when requesting %s via hero: %s", url, error);
    } finally {
      await hero.close();
    }

    return data;
  }

}

export default WebCrawler;
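
/*
 * Usage sketch (not part of the original file): how a caller might drive WebCrawler.
 * Assumptions: HeroBot exposes `name`, `ua` and `heroServer` as instance properties
 * (scrap() reads them above); the zero-argument construction, the property assignments
 * and the example core address below are illustrative, not confirmed by this file.
 *
 *   import WebCrawler from './WebCrawler.mjs';
 *
 *   const crawler = new WebCrawler();
 *   crawler.ua = 'mob';                            // switch to the iPhone Safari profile
 *   // crawler.heroServer = 'ws://localhost:1818'; // optional: connect to a remote Hero core
 *
 *   const result = await crawler.scrap('https://example.com/some-article');
 *   if (result.done) {
 *     console.log(result.title, result.cover, result.cover_type);
 *   }
 */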