Browse Source

Add iPhone HTML parsing for Douyin

master
filesite 7 months ago
parent
commit
e186750d67
  1. 43
      bot/Douyin.mjs
  2. 2
      bot/HeroBot.mjs
  3. 2
      config.mjs
  4. 86
      test/douyin_test.mjs
  5. 10
      test/scrap_test.mjs

43
bot/Douyin.mjs

@@ -8,8 +8,12 @@ class Douyin extends HeroBot {
async scrap(url) {
let data = {url: url, done: false};
//use iphone
configs.userAgent = configs.userAgents.iphone_safari;
configs.viewport = configs.viewports.mob;
let options = {
userAgent: configs.userAgent,
userAgent: configs.userAgent, //default mac os
viewport: configs.viewport,
};
@@ -36,20 +40,26 @@ class Douyin extends HeroBot {
//等待所有内容加载完成
const tab = await hero.activeTab;
await tab.waitForLoad('AllContentLoaded', {timeoutMs: configs.heroTabOptions.timeoutMs});
await hero.waitForState({
all(assert) {
assert(
hero.document.title,
text => text != '',
);
}
}, {timeoutMs: configs.heroTabOptions.timeoutMs});
//for mob
await tab.waitForLoad('DomContentLoaded', {timeoutMs: configs.heroTabOptions.timeoutMs});
//for pc
//await tab.waitForLoad('AllContentLoaded', {timeoutMs: configs.heroTabOptions.timeoutMs});
//await hero.waitForState({
// all(assert) {
// assert(
// hero.document.title,
// text => text != '',
// );
// }
//}, {timeoutMs: configs.heroTabOptions.timeoutMs});
//解析网页HTML数据
data.title = await hero.document.title;
//data.url = await hero.url;
//pc版网页解析
/*
const elems = await hero.document.querySelectorAll('meta');
let meta_name = '';
for (const elem of elems) {
@@ -63,6 +73,17 @@ class Douyin extends HeroBot {
data.title = await elem.getAttribute('content');
}
}
*/
//手机版网页解析
const imgElem = await hero.querySelector('.video-container img.poster');
if (!imgElem) {
console.error('HTML解析出错,找不到封面图', data);
await hero.close();
return false;
}
data.cover = await imgElem.src;
//get cover image's base64 data
if (typeof(data.cover) != 'undefined' && data.cover) {

2
bot/HeroBot.mjs

@@ -58,7 +58,7 @@ class HeroBot {
//等待所有内容加载完成
const tab = await hero.activeTab;
await tab.waitForLoad('AllContentLoaded', {timeoutMs: configs.heroTabOptions.timeoutMs});
await tab.waitForLoad('DomContentLoaded', {timeoutMs: configs.heroTabOptions.timeoutMs});
//保存profile
const latestUserProfile = await hero.exportUserProfile();

2
config.mjs

@@ -65,7 +65,7 @@ let configs = {
//常用浏览器user-agent
userAgents: {
iphone_chrome: 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1',
iphone_safari: 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1',
iphone_wechat: 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/604.4.7 (KHTML, like Gecko) Mobile/15C202 MicroMessenger/6.6.1 NetType/4G Language/zh_CN',
mac_chrome: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
android_wechat: 'Mozilla/5.0 (Linux; Android 7.1.1; OD103 Build/NMF26F; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.49 Mobile MQQBrowser/6.2 TBS/043632 Safari/537.36 MicroMessenger/6.6.1.1220(0x26060135) NetType/4G Language/zh_CN',

86
test/douyin_test.mjs

@@ -2,15 +2,33 @@ import Hero from '@ulixee/hero';
(async () => {
const hero = new Hero({
connectionToCore: 'ws://192.168.3.13:1818',
userAgent: 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1'
connectionToCore: 'ws://127.0.0.1:1818',
//iphone 12 Pro
userAgent: 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1',
viewport: {
width: 390,
height: 844
},
//mac mini
//userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
//viewport: {
// width: 1440,
// height: 900
//},
showChrome: true,
showChromeInteractions: true,
showDevtools: true,
showChromeAlive: true,
});
const url = 'https://v.douyin.com/iJr1NsJJ/';
const url = 'https://v.douyin.com/i2PBaR5B/';
console.log("请求 %s 中。。。", url);
await hero.goto(url, {
timeoutMs: 10000,
referrer: 'https://wechat.com',
timeoutMs: 60000,
referrer: '',
});
//const title = await hero.document.title;
@@ -18,20 +36,54 @@ import Hero from '@ulixee/hero';
//等待所有内容加载完成
const tab = await hero.activeTab;
await tab.waitForLoad('AllContentLoaded', {timeoutMs: 5000});
//mac mini
//await hero.waitForPaintingStable();
//await tab.waitForLoad('AllContentLoaded', {timeoutMs: 30000});
await tab.waitForLoad('DomContentLoaded', {timeoutMs: 30000});
//await hero.waitForState({
// all(assert) {
// assert(
// hero.detach( hero.document.querySelectorAll('img.poster') ),
// els => els && els.length > 0,
// );
// }
// }, {timeoutMs: 20000});
//console.log('poster封面图标签已经准备好');
console.log('加载完成', await hero.isPaintingStable, await hero.isDomContentLoaded, await hero.isAllContentLoaded);
const elems = await hero.detach( hero.document.querySelectorAll('meta') );
console.log('数量', elems.length);
let meta_name = '';
for (const elem of elems) {
meta_name = elem.getAttribute('name');
if (!meta_name) {continue;}
meta_name = meta_name.toLowerCase();
if (meta_name.indexOf('video_cover_image_url') > -1 || meta_name.indexOf('video_title') > -1) {
console.log('meta name %s, content: %s', meta_name, elem.getAttribute('content'));
}
}
//解析网页HTML数据
let doc_url = await hero.document.location.href;
console.log('网址', doc_url);
//let doc_html = await hero.document.body.innerHTML;
//console.log('网页内容', doc_html);
let title = await hero.document.title;
console.log('网页标题', title);
const elem = await hero.querySelector('.video-container img.poster');
let imgUrl = '';
imgUrl = await elem.src;
console.log('post image url: %s', imgUrl);
//const elems = await hero.detach( hero.document.querySelectorAll('meta') );
//const elems = await hero.document.querySelectorAll('meta');
////console.log('数量', await elems.length);
//let meta_name = '';
//for (const elem in elems) {
// meta_name = await elem.getAttribute('name');
// if (!meta_name) {continue;}
// console.log('meta name %s, content: %s', meta_name, await elem.getAttribute('content'));
// meta_name = meta_name.toLowerCase();
// if (meta_name.indexOf('video_cover_image_url') > -1 || meta_name.indexOf('video_title') > -1) {
// console.log('meta name %s, content: %s', meta_name, await elem.getAttribute('content'));
// }
//}
await hero.close();
})().catch(error => {

10
test/scrap_test.mjs

@@ -2,24 +2,26 @@ import Douyin from '../bot/Douyin.mjs';
import Kuaishou from '../bot/Kuaishou.mjs';
import Xigua from '../bot/Xigua.mjs';
import Bilibili from '../bot/Bilibili.mjs';
import configs from '../config.mjs';
import getConfigs from '../config.mjs';
(async () => {
//设置configs为全局变量
global.configs = await getConfigs();
let test_bot = 'douyin';
if (process.argv.length == 3) {
test_bot = process.argv[2];
}
console.log('当前测试Bot:%s', test_bot);
const heroCloudServer = 'ws://192.168.3.13:1818';
const heroCloudServer = 'ws://127.0.0.1:1818';
let url = '', data = {};
switch(test_bot) {
case 'douyin':
//抖音测试
url = 'https://v.douyin.com/ieUpFCva/'; //mob and pc
url = 'https://v.douyin.com/i8sEyb6/'; //mob and pc
url = 'https://v.douyin.com/i2PBaR5B/'; //mob and pc
configs.heroTabOptions.timeoutMs = 20000; //所有内容加载完成超时

Loading…
Cancel
Save