From e186750d67c156ba301be22c67367d34625808b2 Mon Sep 17 00:00:00 2001 From: filesite Date: Fri, 10 May 2024 19:27:51 +0800 Subject: [PATCH] add iphone html pars for douyin --- bot/Douyin.mjs | 43 ++++++++++++++++------ bot/HeroBot.mjs | 2 +- config.mjs | 4 +-- test/douyin_test.mjs | 86 +++++++++++++++++++++++++++++++++++--------- test/scrap_test.mjs | 10 +++--- 5 files changed, 110 insertions(+), 35 deletions(-) diff --git a/bot/Douyin.mjs b/bot/Douyin.mjs index e599c2c..ccad328 100644 --- a/bot/Douyin.mjs +++ b/bot/Douyin.mjs @@ -8,8 +8,12 @@ class Douyin extends HeroBot { async scrap(url) { let data = {url: url, done: false}; + //use iphone + configs.userAgent = configs.userAgents.iphone_safari; + configs.viewport = configs.viewports.mob; + let options = { - userAgent: configs.userAgent, + userAgent: configs.userAgent, //default mac os viewport: configs.viewport, }; @@ -36,20 +40,26 @@ class Douyin extends HeroBot { //等待所有内容加载完成 const tab = await hero.activeTab; - await tab.waitForLoad('AllContentLoaded', {timeoutMs: configs.heroTabOptions.timeoutMs}); - await hero.waitForState({ - all(assert) { - assert( - hero.document.title, - text => text != '', - ); - } - }, {timeoutMs: configs.heroTabOptions.timeoutMs}); + + //for mob + await tab.waitForLoad('DomContentLoaded', {timeoutMs: configs.heroTabOptions.timeoutMs}); + + //for pc + //await tab.waitForLoad('AllContentLoaded', {timeoutMs: configs.heroTabOptions.timeoutMs}); + //await hero.waitForState({ + // all(assert) { + // assert( + // hero.document.title, + // text => text != '', + // ); + // } + //}, {timeoutMs: configs.heroTabOptions.timeoutMs}); //解析网页HTML数据 data.title = await hero.document.title; - //data.url = await hero.url; + //pc版网页解析 + /* const elems = await hero.document.querySelectorAll('meta'); let meta_name = ''; for (const elem of elems) { @@ -63,6 +73,17 @@ class Douyin extends HeroBot { data.title = await elem.getAttribute('content'); } } + */ + + //手机版网页解析 + const imgElem = await hero.querySelector('.video-container img.poster'); + if (!imgElem) { + console.error('HTML解析出错,找不到封面图', data); + await hero.close(); + return false; + } + + data.cover = await imgElem.src; //get cover image's base64 data if (typeof(data.cover) != 'undefined' && data.cover) { diff --git a/bot/HeroBot.mjs b/bot/HeroBot.mjs index db12313..90eb688 100644 --- a/bot/HeroBot.mjs +++ b/bot/HeroBot.mjs @@ -58,7 +58,7 @@ class HeroBot { //等待所有内容加载完成 const tab = await hero.activeTab; - await tab.waitForLoad('AllContentLoaded', {timeoutMs: configs.heroTabOptions.timeoutMs}); + await tab.waitForLoad('DomContentLoaded', {timeoutMs: configs.heroTabOptions.timeoutMs}); //保存profile const latestUserProfile = await hero.exportUserProfile(); diff --git a/config.mjs b/config.mjs index b3e2cb1..47123b9 100644 --- a/config.mjs +++ b/config.mjs @@ -65,7 +65,7 @@ let configs = { //常用浏览器user-agent userAgents: { - iphone_chrome: 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1', + iphone_safari: 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1', iphone_wechat: 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/604.4.7 (KHTML, like Gecko) Mobile/15C202 MicroMessenger/6.6.1 NetType/4G Language/zh_CN', mac_chrome: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36', android_wechat: 'Mozilla/5.0 (Linux; Android 7.1.1; OD103 Build/NMF26F; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.49 Mobile MQQBrowser/6.2 TBS/043632 Safari/537.36 MicroMessenger/6.6.1.1220(0x26060135) NetType/4G Language/zh_CN', @@ -86,4 +86,4 @@ async function getConfig() { return configs; } -export default getConfig; \ No newline at end of file +export default getConfig; diff --git a/test/douyin_test.mjs b/test/douyin_test.mjs index 6fd6307..b7c1a58 100644 --- a/test/douyin_test.mjs +++ b/test/douyin_test.mjs @@ -2,15 +2,33 @@ import Hero from '@ulixee/hero'; (async () => { const hero = new Hero({ - connectionToCore: 'ws://192.168.3.13:1818', - userAgent: 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1' + connectionToCore: 'ws://127.0.0.1:1818', + + //iphone 12 Pro + userAgent: 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1', + viewport: { + width: 390, + height: 844 + }, + + //mac mini + //userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36', + //viewport: { + // width: 1440, + // height: 900 + //}, + + showChrome: true, + showChromeInteractions: true, + showDevtools: true, + showChromeAlive: true, }); - const url = 'https://v.douyin.com/iJr1NsJJ/'; + const url = 'https://v.douyin.com/i2PBaR5B/'; console.log("请求 %s 中。。。", url); await hero.goto(url, { - timeoutMs: 10000, - referrer: 'https://wechat.com', + timeoutMs: 60000, + referrer: '', }); //const title = await hero.document.title; @@ -18,20 +36,54 @@ import Hero from '@ulixee/hero'; //等待所有内容加载完成 const tab = await hero.activeTab; - await tab.waitForLoad('AllContentLoaded', {timeoutMs: 5000}); + + //mac mini + //await hero.waitForPaintingStable(); + //await tab.waitForLoad('AllContentLoaded', {timeoutMs: 30000}); + await tab.waitForLoad('DomContentLoaded', {timeoutMs: 30000}); + + //await hero.waitForState({ + // all(assert) { + // assert( + // hero.detach( hero.document.querySelectorAll('img.poster') ), + // els => els && els.length > 0, + // ); + // } + // }, {timeoutMs: 20000}); + //console.log('poster封面图标签已经准备好'); + console.log('加载完成', await hero.isPaintingStable, await hero.isDomContentLoaded, await hero.isAllContentLoaded); - const elems = await hero.detach( hero.document.querySelectorAll('meta') ); - console.log('数量', elems.length); - let meta_name = ''; - for (const elem of elems) { - meta_name = elem.getAttribute('name'); - if (!meta_name) {continue;} - meta_name = meta_name.toLowerCase(); - if (meta_name.indexOf('video_cover_image_url') > -1 || meta_name.indexOf('video_title') > -1) { - console.log('meta name %s, content: %s', meta_name, elem.getAttribute('content')); - } - } + + //解析网页HTML数据 + let doc_url = await hero.document.location.href; + console.log('网址', doc_url); + + //let doc_html = await hero.document.body.innerHTML; + //console.log('网页内容', doc_html); + + let title = await hero.document.title; + console.log('网页标题', title); + + const elem = await hero.querySelector('.video-container img.poster'); + let imgUrl = ''; + imgUrl = await elem.src; + console.log('post image url: %s', imgUrl); + + //const elems = await hero.detach( hero.document.querySelectorAll('meta') ); + //const elems = await hero.document.querySelectorAll('meta'); + ////console.log('数量', await elems.length); + //let meta_name = ''; + //for (const elem in elems) { + // meta_name = await elem.getAttribute('name'); + // if (!meta_name) {continue;} + // console.log('meta name %s, content: %s', meta_name, await elem.getAttribute('content')); + + // meta_name = meta_name.toLowerCase(); + // if (meta_name.indexOf('video_cover_image_url') > -1 || meta_name.indexOf('video_title') > -1) { + // console.log('meta name %s, content: %s', meta_name, await elem.getAttribute('content')); + // } + //} await hero.close(); })().catch(error => { diff --git a/test/scrap_test.mjs b/test/scrap_test.mjs index 8df88a9..ea9102d 100644 --- a/test/scrap_test.mjs +++ b/test/scrap_test.mjs @@ -2,24 +2,26 @@ import Douyin from '../bot/Douyin.mjs'; import Kuaishou from '../bot/Kuaishou.mjs'; import Xigua from '../bot/Xigua.mjs'; import Bilibili from '../bot/Bilibili.mjs'; -import configs from '../config.mjs'; +import getConfigs from '../config.mjs'; (async () => { + //设置configs为全局变量 + global.configs = await getConfigs(); + let test_bot = 'douyin'; if (process.argv.length == 3) { test_bot = process.argv[2]; } console.log('当前测试Bot:%s', test_bot); - const heroCloudServer = 'ws://192.168.3.13:1818'; + const heroCloudServer = 'ws://127.0.0.1:1818'; let url = '', data = {}; switch(test_bot) { case 'douyin': //抖音测试 - url = 'https://v.douyin.com/ieUpFCva/'; //mob and pc - url = 'https://v.douyin.com/i8sEyb6/'; //mob and pc + url = 'https://v.douyin.com/i2PBaR5B/'; //mob and pc configs.heroTabOptions.timeoutMs = 20000; //所有内容加载完成超时