Browse Source

Add iPhone HTML parsing for Douyin

master
filesite 7 months ago
parent
commit
e186750d67
  1. 43
      bot/Douyin.mjs
  2. 2
      bot/HeroBot.mjs
  3. 2
      config.mjs
  4. 86
      test/douyin_test.mjs
  5. 10
      test/scrap_test.mjs

43
bot/Douyin.mjs

@ -8,8 +8,12 @@ class Douyin extends HeroBot {
async scrap(url) { async scrap(url) {
let data = {url: url, done: false}; let data = {url: url, done: false};
//use iphone
configs.userAgent = configs.userAgents.iphone_safari;
configs.viewport = configs.viewports.mob;
let options = { let options = {
userAgent: configs.userAgent, userAgent: configs.userAgent, //default mac os
viewport: configs.viewport, viewport: configs.viewport,
}; };
@ -36,20 +40,26 @@ class Douyin extends HeroBot {
//等待所有内容加载完成 //等待所有内容加载完成
const tab = await hero.activeTab; const tab = await hero.activeTab;
await tab.waitForLoad('AllContentLoaded', {timeoutMs: configs.heroTabOptions.timeoutMs});
await hero.waitForState({ //for mob
all(assert) { await tab.waitForLoad('DomContentLoaded', {timeoutMs: configs.heroTabOptions.timeoutMs});
assert(
hero.document.title, //for pc
text => text != '', //await tab.waitForLoad('AllContentLoaded', {timeoutMs: configs.heroTabOptions.timeoutMs});
); //await hero.waitForState({
} // all(assert) {
}, {timeoutMs: configs.heroTabOptions.timeoutMs}); // assert(
// hero.document.title,
// text => text != '',
// );
// }
//}, {timeoutMs: configs.heroTabOptions.timeoutMs});
//解析网页HTML数据 //解析网页HTML数据
data.title = await hero.document.title; data.title = await hero.document.title;
//data.url = await hero.url;
//pc版网页解析
/*
const elems = await hero.document.querySelectorAll('meta'); const elems = await hero.document.querySelectorAll('meta');
let meta_name = ''; let meta_name = '';
for (const elem of elems) { for (const elem of elems) {
@ -63,6 +73,17 @@ class Douyin extends HeroBot {
data.title = await elem.getAttribute('content'); data.title = await elem.getAttribute('content');
} }
} }
*/
//手机版网页解析
const imgElem = await hero.querySelector('.video-container img.poster');
if (!imgElem) {
console.error('HTML解析出错,找不到封面图', data);
await hero.close();
return false;
}
data.cover = await imgElem.src;
//get cover image's base64 data //get cover image's base64 data
if (typeof(data.cover) != 'undefined' && data.cover) { if (typeof(data.cover) != 'undefined' && data.cover) {

2
bot/HeroBot.mjs

@ -58,7 +58,7 @@ class HeroBot {
//等待所有内容加载完成 //等待所有内容加载完成
const tab = await hero.activeTab; const tab = await hero.activeTab;
await tab.waitForLoad('AllContentLoaded', {timeoutMs: configs.heroTabOptions.timeoutMs}); await tab.waitForLoad('DomContentLoaded', {timeoutMs: configs.heroTabOptions.timeoutMs});
//保存profile //保存profile
const latestUserProfile = await hero.exportUserProfile(); const latestUserProfile = await hero.exportUserProfile();

2
config.mjs

@ -65,7 +65,7 @@ let configs = {
//常用浏览器user-agent //常用浏览器user-agent
userAgents: { userAgents: {
iphone_chrome: 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1', iphone_safari: 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1',
iphone_wechat: 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/604.4.7 (KHTML, like Gecko) Mobile/15C202 MicroMessenger/6.6.1 NetType/4G Language/zh_CN', iphone_wechat: 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/604.4.7 (KHTML, like Gecko) Mobile/15C202 MicroMessenger/6.6.1 NetType/4G Language/zh_CN',
mac_chrome: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36', mac_chrome: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
android_wechat: 'Mozilla/5.0 (Linux; Android 7.1.1; OD103 Build/NMF26F; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.49 Mobile MQQBrowser/6.2 TBS/043632 Safari/537.36 MicroMessenger/6.6.1.1220(0x26060135) NetType/4G Language/zh_CN', android_wechat: 'Mozilla/5.0 (Linux; Android 7.1.1; OD103 Build/NMF26F; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.49 Mobile MQQBrowser/6.2 TBS/043632 Safari/537.36 MicroMessenger/6.6.1.1220(0x26060135) NetType/4G Language/zh_CN',

86
test/douyin_test.mjs

@ -2,15 +2,33 @@ import Hero from '@ulixee/hero';
(async () => { (async () => {
const hero = new Hero({ const hero = new Hero({
connectionToCore: 'ws://192.168.3.13:1818', connectionToCore: 'ws://127.0.0.1:1818',
userAgent: 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1'
//iphone 12 Pro
userAgent: 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1',
viewport: {
width: 390,
height: 844
},
//mac mini
//userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
//viewport: {
// width: 1440,
// height: 900
//},
showChrome: true,
showChromeInteractions: true,
showDevtools: true,
showChromeAlive: true,
}); });
const url = 'https://v.douyin.com/iJr1NsJJ/'; const url = 'https://v.douyin.com/i2PBaR5B/';
console.log("请求 %s 中。。。", url); console.log("请求 %s 中。。。", url);
await hero.goto(url, { await hero.goto(url, {
timeoutMs: 10000, timeoutMs: 60000,
referrer: 'https://wechat.com', referrer: '',
}); });
//const title = await hero.document.title; //const title = await hero.document.title;
@ -18,20 +36,54 @@ import Hero from '@ulixee/hero';
//等待所有内容加载完成 //等待所有内容加载完成
const tab = await hero.activeTab; const tab = await hero.activeTab;
await tab.waitForLoad('AllContentLoaded', {timeoutMs: 5000});
//mac mini
//await hero.waitForPaintingStable();
//await tab.waitForLoad('AllContentLoaded', {timeoutMs: 30000});
await tab.waitForLoad('DomContentLoaded', {timeoutMs: 30000});
//await hero.waitForState({
// all(assert) {
// assert(
// hero.detach( hero.document.querySelectorAll('img.poster') ),
// els => els && els.length > 0,
// );
// }
// }, {timeoutMs: 20000});
//console.log('poster封面图标签已经准备好');
console.log('加载完成', await hero.isPaintingStable, await hero.isDomContentLoaded, await hero.isAllContentLoaded); console.log('加载完成', await hero.isPaintingStable, await hero.isDomContentLoaded, await hero.isAllContentLoaded);
const elems = await hero.detach( hero.document.querySelectorAll('meta') );
console.log('数量', elems.length); //解析网页HTML数据
let meta_name = ''; let doc_url = await hero.document.location.href;
for (const elem of elems) { console.log('网址', doc_url);
meta_name = elem.getAttribute('name');
if (!meta_name) {continue;} //let doc_html = await hero.document.body.innerHTML;
meta_name = meta_name.toLowerCase(); //console.log('网页内容', doc_html);
if (meta_name.indexOf('video_cover_image_url') > -1 || meta_name.indexOf('video_title') > -1) {
console.log('meta name %s, content: %s', meta_name, elem.getAttribute('content')); let title = await hero.document.title;
} console.log('网页标题', title);
}
const elem = await hero.querySelector('.video-container img.poster');
let imgUrl = '';
imgUrl = await elem.src;
console.log('post image url: %s', imgUrl);
//const elems = await hero.detach( hero.document.querySelectorAll('meta') );
//const elems = await hero.document.querySelectorAll('meta');
////console.log('数量', await elems.length);
//let meta_name = '';
//for (const elem in elems) {
// meta_name = await elem.getAttribute('name');
// if (!meta_name) {continue;}
// console.log('meta name %s, content: %s', meta_name, await elem.getAttribute('content'));
// meta_name = meta_name.toLowerCase();
// if (meta_name.indexOf('video_cover_image_url') > -1 || meta_name.indexOf('video_title') > -1) {
// console.log('meta name %s, content: %s', meta_name, await elem.getAttribute('content'));
// }
//}
await hero.close(); await hero.close();
})().catch(error => { })().catch(error => {

10
test/scrap_test.mjs

@ -2,24 +2,26 @@ import Douyin from '../bot/Douyin.mjs';
import Kuaishou from '../bot/Kuaishou.mjs'; import Kuaishou from '../bot/Kuaishou.mjs';
import Xigua from '../bot/Xigua.mjs'; import Xigua from '../bot/Xigua.mjs';
import Bilibili from '../bot/Bilibili.mjs'; import Bilibili from '../bot/Bilibili.mjs';
import configs from '../config.mjs'; import getConfigs from '../config.mjs';
(async () => { (async () => {
//设置configs为全局变量
global.configs = await getConfigs();
let test_bot = 'douyin'; let test_bot = 'douyin';
if (process.argv.length == 3) { if (process.argv.length == 3) {
test_bot = process.argv[2]; test_bot = process.argv[2];
} }
console.log('当前测试Bot:%s', test_bot); console.log('当前测试Bot:%s', test_bot);
const heroCloudServer = 'ws://192.168.3.13:1818'; const heroCloudServer = 'ws://127.0.0.1:1818';
let url = '', data = {}; let url = '', data = {};
switch(test_bot) { switch(test_bot) {
case 'douyin': case 'douyin':
//抖音测试 //抖音测试
url = 'https://v.douyin.com/ieUpFCva/'; //mob and pc url = 'https://v.douyin.com/i2PBaR5B/'; //mob and pc
url = 'https://v.douyin.com/i8sEyb6/'; //mob and pc
configs.heroTabOptions.timeoutMs = 20000; //所有内容加载完成超时 configs.heroTabOptions.timeoutMs = 20000; //所有内容加载完成超时

Loading…
Cancel
Save