From 866e59c2d59f23afb63107522c0d34bd3f54e35f Mon Sep 17 00:00:00 2001 From: filesite Date: Mon, 11 Sep 2023 17:46:04 +0800 Subject: [PATCH] pc and mob share link test done for douyin, kuaishou and xigua --- bot/Douyin.mjs | 39 +++++++++++++++++++++++++++---------- bot/HeroBot.mjs | 47 ++++++++++++++++++++++++++++++++++++++++----- bot/Kuaishou.mjs | 12 +++++++++--- bot/Xigua.mjs | 18 ++++++++++++----- config.mjs | 19 ++++++++++++++++-- test/scrap_test.mjs | 32 +++++++++++++++++++++--------- 6 files changed, 133 insertions(+), 34 deletions(-) diff --git a/bot/Douyin.mjs b/bot/Douyin.mjs index 7452913..d9746c6 100644 --- a/bot/Douyin.mjs +++ b/bot/Douyin.mjs @@ -1,45 +1,64 @@ import Hero from '@ulixee/hero'; import configs from '../config.mjs'; +import HeroBot from './HeroBot.mjs'; -class Douyin { - constructor(heroCloudServer) { - this.heroServer = heroCloudServer ? heroCloudServer : ''; - } +class Douyin extends HeroBot { async scrap(url) { let data = {}; try { - let options = {}; + let options = { + userAgent: configs.userAgent, + viewport: configs.viewport + }; + if (this.heroServer) { options.connectionToCore = this.heroServer; } + const profile = await this.init('douyin'); + if (profile) { + options.userProfile = this.fixCookies(profile); + } + const hero = new Hero(options); await hero.goto(url, configs.heroBotOptions); //等待所有内容加载完成 const tab = await hero.activeTab; await tab.waitForLoad('AllContentLoaded', {timeoutMs: configs.heroTabOptions.timeoutMs}); + await hero.waitForState({ + all(assert) { + assert( + hero.document.title, + text => text != '', + ); + } + }); //解析网页HTML数据 - const elems = await hero.detach( hero.document.querySelectorAll('meta') ); + data.title = await hero.document.title; + //data.url = await hero.url; + + const elems = await hero.document.querySelectorAll('meta'); let meta_name = ''; for (const elem of elems) { - meta_name = elem.getAttribute('name'); + meta_name = await elem.getAttribute('name'); if (!meta_name) {continue;} meta_name = meta_name.toLowerCase(); + //console.log('meta', meta_name); if (meta_name.indexOf('video_cover_image_url') > -1) { - data.cover = elem.getAttribute('content'); + data.cover = await elem.getAttribute('content'); }else if (meta_name.indexOf('video_title') > -1) { - data.title = elem.getAttribute('content'); + data.title = await elem.getAttribute('content'); } } await hero.close(); }catch(error) { console.error("Error got when request %s via hero: %s", url, error); - }; + } return data; } diff --git a/bot/HeroBot.mjs b/bot/HeroBot.mjs index 0496a1a..40385f0 100644 --- a/bot/HeroBot.mjs +++ b/bot/HeroBot.mjs @@ -28,7 +28,10 @@ class HeroBot { try { this.name = botName; - let options = {}; + let options = { + userAgent: configs.userAgent, + viewport: configs.viewport + }; if (this.heroServer) { options.connectionToCore = this.heroServer; @@ -37,10 +40,11 @@ class HeroBot { const profilePath = path.resolve('../tmp/', `profile_${botName}.json`); if (fs.existsSync(profilePath) != false) { const json = fs.readFileSync(profilePath, { encoding: 'utf8' }); - options.userProfile = JSON.parse(json); - return options.userProfile; + return JSON.parse(json); } + console.log('Hero init配置', configs); + const hero = new Hero(options); await hero.goto(base_url, configs.heroBotOptions); @@ -50,18 +54,51 @@ class HeroBot { //保存profile const latestUserProfile = await hero.exportUserProfile(); - fs.writeFileSync(profilePath, JSON.stringify(latestUserProfile, null, 2)); + this.saveProfile(latestUserProfile); await hero.close(); return latestUserProfile; }catch(error) { console.error("Error got when request %s via hero: %s", base_url, error); - }; + } return false; } + //保存profile + saveProfile(profile) { + if (this.name == '') {return false;} + + const botName = this.name; + + try { + //保存profile + const profilePath = path.resolve('../tmp/', `profile_${botName}.json`); + fs.writeFileSync(profilePath, JSON.stringify(profile, null, 2)); + }catch(error) { + console.error("Error got when save profile of %s, error detail:\n%s", botName, error); + return false; + } + + return true; + } + + //处理name为空的cookie + fixCookies(profile) { + let fixedProfile = profile; + if (typeof(profile.cookies) == 'undefined') {return profile;} + + const botName = this.name; + for (const index in profile.cookies) { + if (profile.cookies[index].name == '') { + fixedProfile.cookies[index].name = botName; + } + } + + return fixedProfile; + } + } export default HeroBot; diff --git a/bot/Kuaishou.mjs b/bot/Kuaishou.mjs index 20ac7e7..edfd9a2 100644 --- a/bot/Kuaishou.mjs +++ b/bot/Kuaishou.mjs @@ -8,7 +8,10 @@ class Kuaishou extends HeroBot { let data = {}; try { - let options = {}; + let options = { + userAgent: configs.userAgent, + viewport: configs.viewport + }; if (this.heroServer) { options.connectionToCore = this.heroServer; @@ -25,13 +28,16 @@ class Kuaishou extends HeroBot { //等待所有内容加载完成 const tab = await hero.activeTab; await tab.waitForLoad('AllContentLoaded', {timeoutMs: configs.heroTabOptions.timeoutMs}); + await hero.waitForPaintingStable(); //解析网页HTML数据 data.title = await hero.document.title; //data.url = await hero.url; - const elem = await hero.detach( hero.document.querySelector('.video-container-player') ); - data.cover = elem.getAttribute('poster'); + const elem = hero.document.querySelector('.video-container-player'); + if (elem) { + data.cover = await elem.getAttribute('poster'); + } await hero.close(); }catch(error) { diff --git a/bot/Xigua.mjs b/bot/Xigua.mjs index 5ec1522..631c4eb 100644 --- a/bot/Xigua.mjs +++ b/bot/Xigua.mjs @@ -8,7 +8,10 @@ class Xigua extends HeroBot { let data = {}; try { - let options = {}; + let options = { + userAgent: configs.userAgent, + viewport: configs.viewport + }; if (this.heroServer) { options.connectionToCore = this.heroServer; @@ -25,18 +28,23 @@ class Xigua extends HeroBot { //等待所有内容加载完成 const tab = await hero.activeTab; await tab.waitForLoad('AllContentLoaded', {timeoutMs: configs.heroTabOptions.timeoutMs}); + await hero.waitForPaintingStable(); //解析网页HTML数据 - const elems = await hero.detach( hero.document.querySelectorAll('meta') ); + data.title = await hero.document.title; + //data.url = await hero.url; + + const elems = await hero.document.querySelectorAll('meta'); let meta_name = ''; for (const elem of elems) { - meta_name = elem.getAttribute('name'); + meta_name = await elem.getAttribute('name'); if (!meta_name) {continue;} meta_name = meta_name.toLowerCase(); + //console.log('meta', meta_name); if (meta_name.indexOf('og:image') > -1) { - data.cover = elem.getAttribute('content'); + data.cover = await elem.getAttribute('content'); }else if (meta_name.indexOf('og:title') > -1) { - data.title = elem.getAttribute('content'); + data.title = await elem.getAttribute('content'); } } diff --git a/config.mjs b/config.mjs index ee5512e..0bffbed 100644 --- a/config.mjs +++ b/config.mjs @@ -1,10 +1,25 @@ export default { + userAgent: '~ chrome >= 114 && mac', + viewport: { + width: 1440, + height: 900 + }, + + viewports: { + mob: { + width: 375, + height: 667 + }, + pc: { + width: 1440, + height: 900 + }, + }, //请求参数 heroBotOptions: { timeoutMs: 10000, referrer: '', - userAgent: 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/604.4.7 (KHTML, like Gecko) Mobile/15C202 MicroMessenger/6.6.1 NetType/4G Language/zh_CN' }, //网页tab参数 @@ -16,7 +31,7 @@ export default { userAgents: { iphone_chrome: 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1', iphone_wechat: 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/604.4.7 (KHTML, like Gecko) Mobile/15C202 MicroMessenger/6.6.1 NetType/4G Language/zh_CN', - mac_chrome: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36', + mac_chrome: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36', android_wechat: 'Mozilla/5.0 (Linux; Android 7.1.1; OD103 Build/NMF26F; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.49 Mobile MQQBrowser/6.2 TBS/043632 Safari/537.36 MicroMessenger/6.6.1.1220(0x26060135) NetType/4G Language/zh_CN', } diff --git a/test/scrap_test.mjs b/test/scrap_test.mjs index 00e13df..3d8f29c 100644 --- a/test/scrap_test.mjs +++ b/test/scrap_test.mjs @@ -17,8 +17,16 @@ import configs from '../config.mjs'; case 'douyin': //抖音测试 + url = 'https://v.douyin.com/ieUpFCva/'; //mob and pc + + configs.heroTabOptions.timeoutMs = 20000; //所有内容加载完成超时 + + configs.userAgent = configs.userAgents.mac_chrome; + configs.viewport = configs.viewports.pc; + + console.log('Hero配置', configs); + const douyin = new Douyin(heroCloudServer); - url = 'https://v.douyin.com/iJr1NsJJ/'; console.log('请求中: %s ...', url); data = await douyin.scrap(url); console.log("解析结果:\n%s", JSON.stringify(data)); @@ -28,15 +36,17 @@ import configs from '../config.mjs'; case 'kuaishou': //快手测试 + url = 'https://www.kuaishou.com/f/X8FTguiIjZQVwE7'; //pc + //url = 'https://v.kuaishou.com/7zwqe6'; //mob + configs.heroTabOptions.timeoutMs = 20000; //所有内容加载完成超时 - configs.heroBotOptions.userAgent = configs.userAgents.iphone_wechat; - //configs.heroBotOptions.userAgent = configs.userAgents.mac_chrome; + + configs.userAgent = configs.userAgents.mac_chrome; + configs.viewport = configs.viewports.pc; console.log('Hero配置', configs); const kuaishou = new Kuaishou(heroCloudServer); - //url = 'https://www.kuaishou.com/f/X8FTguiIjZQVwE7'; - url = 'https://v.kuaishou.com/7zwqe6'; - //url = 'https://www.kuaishou.com/'; + console.log('请求中: %s ...', url); data = await kuaishou.scrap(url); console.log("解析结果:\n%s", JSON.stringify(data)); @@ -46,15 +56,19 @@ import configs from '../config.mjs'; case 'xigua': //西瓜测试 - //url = 'https://www.ixigua.com/7092326495246516749'; //pc url = 'https://v.ixigua.com/ieUaqrFN/'; //mobile + url = 'https://www.ixigua.com/7248225527335813636'; //pc + configs.heroBotOptions.referrer = url; - configs.heroBotOptions.userAgent = configs.userAgents.mac_chrome; + configs.userAgent = configs.userAgents.mac_chrome; + configs.viewport = configs.viewports.pc; + console.log('Hero配置', configs); const xigua = new Xigua(heroCloudServer); console.log('请求中: %s ...', url); - data = await xigua.scrap(`${url}?wid_try=1`); + //data = await xigua.scrap(`${url}?wid_try=1`); + data = await xigua.scrap(url); console.log("解析结果:\n%s", JSON.stringify(data)); break;