Browse Source

Improve the Douyin and Xigua bots

master
filesite 8 months ago
parent
commit
5c46b8f036
  1. 2
      bot/Bilibili.mjs
  2. 87
      bot/Douyin.mjs
  3. 23
      bot/HeroBot.mjs
  4. 2
      bot/Kuaishou.mjs
  5. 12
      bot/Xigua.mjs
  6. 2
      test/scrap_test.mjs

2
bot/Bilibili.mjs

@ -87,6 +87,8 @@ class Bilibili extends HeroBot { @@ -87,6 +87,8 @@ class Bilibili extends HeroBot {
}catch(error) {
console.error("Error got when request %s via hero: %s", url, error);
await hero.close();
//删除profile文件后重试
await this.deleteProfile();
}
return data;

87
bot/Douyin.mjs

@ -9,8 +9,10 @@ class Douyin extends HeroBot { @@ -9,8 +9,10 @@ class Douyin extends HeroBot {
let data = {url: url, done: false};
//use iphone
configs.userAgent = configs.userAgents.iphone_safari;
configs.viewport = configs.viewports.mob;
if (this.ua == 'mob') {
configs.userAgent = configs.userAgents.iphone_safari;
configs.viewport = configs.viewports.mob;
}
let options = {
userAgent: configs.userAgent, //default mac os
@ -42,48 +44,54 @@ class Douyin extends HeroBot { @@ -42,48 +44,54 @@ class Douyin extends HeroBot {
const tab = await hero.activeTab;
//for mob
await tab.waitForLoad('DomContentLoaded', {timeoutMs: configs.heroTabOptions.timeoutMs});
//for pc
//await tab.waitForLoad('AllContentLoaded', {timeoutMs: configs.heroTabOptions.timeoutMs});
//await hero.waitForState({
// all(assert) {
// assert(
// hero.document.title,
// text => text != '',
// );
// }
//}, {timeoutMs: configs.heroTabOptions.timeoutMs});
if (this.ua == 'mob') {
await tab.waitForLoad('DomContentLoaded', {timeoutMs: configs.heroTabOptions.timeoutMs});
}else {
//for pc
await tab.waitForLoad('DomContentLoaded', {timeoutMs: configs.heroTabOptions.timeoutMs}); //AllContentLoaded, DomContentLoaded
await hero.waitForState({
all(assert) {
assert(
hero.document.title,
text => text != '',
);
}
}, {timeoutMs: configs.heroTabOptions.timeoutMs});
}
//解析网页HTML数据
data.title = await hero.document.title;
//pc版网页解析
/*
const elems = await hero.document.querySelectorAll('meta');
let meta_name = '';
for (const elem of elems) {
meta_name = await elem.getAttribute('name');
if (!meta_name) {continue;}
meta_name = meta_name.toLowerCase();
//console.log('meta', meta_name);
if (meta_name.indexOf('video_cover_image_url') > -1) {
data.cover = await elem.getAttribute('content');
}else if (meta_name.indexOf('video_title') > -1) {
data.title = await elem.getAttribute('content');
if (this.ua == 'mob') {
//手机版网页解析
const imgElem = await hero.querySelector('.video-container img.poster');
if (!imgElem) {
console.error('HTML解析出错,找不到封面图', data);
await hero.close();
//删除profile文件后重试
await this.deleteProfile();
this.ua = 'pc'; //切换到pc模式重试
return false;
}
data.cover = await imgElem.src;
}else {
//pc版网页解析
const elems = await hero.document.querySelectorAll('meta');
let meta_name = '';
for (const elem of elems) {
meta_name = await elem.getAttribute('name');
if (!meta_name) {continue;}
meta_name = meta_name.toLowerCase();
if (meta_name.indexOf('video_cover_image_url') > -1) {
data.cover = await elem.getAttribute('content');
}else if (meta_name.indexOf('video_title') > -1) {
data.title = await elem.getAttribute('content');
}
}
}
*/
//手机版网页解析
const imgElem = await hero.querySelector('.video-container img.poster');
if (!imgElem) {
console.error('HTML解析出错,找不到封面图', data);
await hero.close();
return false;
}
data.cover = await imgElem.src;
//get cover image's base64 data
if (typeof(data.cover) != 'undefined' && data.cover) {
@ -105,6 +113,11 @@ class Douyin extends HeroBot { @@ -105,6 +113,11 @@ class Douyin extends HeroBot {
}catch(error) {
console.error("Error got when request %s via hero: %s", url, error);
await hero.close();
//删除profile文件后重试
await this.deleteProfile();
//切换模式
this.ua = this.ua == 'mob' ? 'pc' : 'mob';
}
return data;

23
bot/HeroBot.mjs

@ -1,5 +1,6 @@ @@ -1,5 +1,6 @@
import Hero from '@ulixee/hero';
import fs from 'node:fs';
import {unlink} from 'node:fs/promises';
import path from 'node:path';
import { fileURLToPath } from 'url';
import ClientLogPlugin from '../plugin/ClientLogPlugin.mjs';
@ -20,6 +21,9 @@ class HeroBot { @@ -20,6 +21,9 @@ class HeroBot {
const __filename = fileURLToPath(import.meta.url);
this.root = path.dirname(__filename);
//记录浏览器模式,便于在需要的时候切换
this.ua = 'pc';
}
//返回profile对象
@ -94,6 +98,25 @@ class HeroBot { @@ -94,6 +98,25 @@ class HeroBot {
return true;
}
//删除profile
async deleteProfile() {
if (this.name == '') {return false;}
const botName = this.name;
try {
const profilePath = path.resolve(this.root, '../tmp/', `profile_${botName}.json`);
if (fs.existsSync(profilePath) != false) {
return await unlink(profilePath);
}
}catch(error) {
console.error("Error got when delete profile of %s, error detail:\n%s", botName, error);
return false;
}
return true;
}
//处理name为空的cookie
fixCookies(profile) {
let fixedProfile = profile;

2
bot/Kuaishou.mjs

@ -68,6 +68,8 @@ class Kuaishou extends HeroBot { @@ -68,6 +68,8 @@ class Kuaishou extends HeroBot {
}catch(error) {
console.error("Error got when request %s via hero: %s", url, error);
await hero.close();
//删除profile文件后重试
await this.deleteProfile();
};
return data;

12
bot/Xigua.mjs

@ -41,7 +41,6 @@ class Xigua extends HeroBot { @@ -41,7 +41,6 @@ class Xigua extends HeroBot {
//解析网页HTML数据
data.title = await hero.document.title;
//data.url = await hero.url;
const elems = await hero.document.querySelectorAll('meta');
let meta_name = '';
@ -57,6 +56,15 @@ class Xigua extends HeroBot { @@ -57,6 +56,15 @@ class Xigua extends HeroBot {
}
}
//Fallback: when the <meta> tags gave no cover, try to read it from the
//background image of <xg-poster class="xgplayer-poster hide">
if (typeof(data.cover) == 'undefined' || !data.cover) {
    const posterElem = await tab.querySelector('xg-poster');
    //The poster element may be absent; skip the fallback instead of throwing
    if (posterElem) {
        const backgroundCss = await posterElem.style.backgroundImage;
        //Capture the URL inside url(...), dropping an optional pair of
        //surrounding single or double quotes
        const urlMatch = typeof(backgroundCss) == 'string'
            ? backgroundCss.match(/url\(\s*(['"]?)(.+?)\1\s*\)/i)
            : null;
        if (urlMatch) {
            data.cover = urlMatch[2];
        }
    }
}
//get cover image's base64 data
if (typeof(data.cover) != 'undefined' && data.cover) {
data.cover = common.getAbsoluteUrl(data.cover);
@ -77,6 +85,8 @@ class Xigua extends HeroBot { @@ -77,6 +85,8 @@ class Xigua extends HeroBot {
}catch(error) {
console.error("Error got when request %s via hero: %s", url, error);
await hero.close();
//删除profile文件后重试
await this.deleteProfile();
};
return data;

2
test/scrap_test.mjs

@ -61,7 +61,7 @@ import getConfigs from '../config.mjs'; @@ -61,7 +61,7 @@ import getConfigs from '../config.mjs';
case 'xigua':
//西瓜测试
url = 'https://v.ixigua.com/ieUaqrFN/'; //mobile
url = 'https://www.ixigua.com/7248225527335813636'; //pc
url = 'https://www.ixigua.com/7343928492197118518'; //pc
configs.heroBotOptions.referrer = url;
configs.userAgent = configs.userAgents.mac_chrome;

Loading…
Cancel
Save