Browse Source

PC and mobile share-link tests done for Douyin, Kuaishou and Xigua

master
filesite 1 year ago
parent
commit
866e59c2d5
  1. 39
      bot/Douyin.mjs
  2. 47
      bot/HeroBot.mjs
  3. 12
      bot/Kuaishou.mjs
  4. 18
      bot/Xigua.mjs
  5. 19
      config.mjs
  6. 32
      test/scrap_test.mjs

39
bot/Douyin.mjs

@ -1,45 +1,64 @@ @@ -1,45 +1,64 @@
import Hero from '@ulixee/hero';
import configs from '../config.mjs';
import HeroBot from './HeroBot.mjs';
class Douyin {
constructor(heroCloudServer) {
this.heroServer = heroCloudServer ? heroCloudServer : '';
}
class Douyin extends HeroBot {
async scrap(url) {
let data = {};
try {
let options = {};
let options = {
userAgent: configs.userAgent,
viewport: configs.viewport
};
if (this.heroServer) {
options.connectionToCore = this.heroServer;
}
const profile = await this.init('douyin');
if (profile) {
options.userProfile = this.fixCookies(profile);
}
const hero = new Hero(options);
await hero.goto(url, configs.heroBotOptions);
//等待所有内容加载完成
const tab = await hero.activeTab;
await tab.waitForLoad('AllContentLoaded', {timeoutMs: configs.heroTabOptions.timeoutMs});
await hero.waitForState({
all(assert) {
assert(
hero.document.title,
text => text != '',
);
}
});
//解析网页HTML数据
const elems = await hero.detach( hero.document.querySelectorAll('meta') );
data.title = await hero.document.title;
//data.url = await hero.url;
const elems = await hero.document.querySelectorAll('meta');
let meta_name = '';
for (const elem of elems) {
meta_name = elem.getAttribute('name');
meta_name = await elem.getAttribute('name');
if (!meta_name) {continue;}
meta_name = meta_name.toLowerCase();
//console.log('meta', meta_name);
if (meta_name.indexOf('video_cover_image_url') > -1) {
data.cover = elem.getAttribute('content');
data.cover = await elem.getAttribute('content');
}else if (meta_name.indexOf('video_title') > -1) {
data.title = elem.getAttribute('content');
data.title = await elem.getAttribute('content');
}
}
await hero.close();
}catch(error) {
console.error("Error got when request %s via hero: %s", url, error);
};
}
return data;
}

47
bot/HeroBot.mjs

@ -28,7 +28,10 @@ class HeroBot { @@ -28,7 +28,10 @@ class HeroBot {
try {
this.name = botName;
let options = {};
let options = {
userAgent: configs.userAgent,
viewport: configs.viewport
};
if (this.heroServer) {
options.connectionToCore = this.heroServer;
@ -37,10 +40,11 @@ class HeroBot { @@ -37,10 +40,11 @@ class HeroBot {
const profilePath = path.resolve('../tmp/', `profile_${botName}.json`);
if (fs.existsSync(profilePath) != false) {
const json = fs.readFileSync(profilePath, { encoding: 'utf8' });
options.userProfile = JSON.parse(json);
return options.userProfile;
return JSON.parse(json);
}
console.log('Hero init配置', configs);
const hero = new Hero(options);
await hero.goto(base_url, configs.heroBotOptions);
@ -50,18 +54,51 @@ class HeroBot { @@ -50,18 +54,51 @@ class HeroBot {
//保存profile
const latestUserProfile = await hero.exportUserProfile();
fs.writeFileSync(profilePath, JSON.stringify(latestUserProfile, null, 2));
this.saveProfile(latestUserProfile);
await hero.close();
return latestUserProfile;
}catch(error) {
console.error("Error got when request %s via hero: %s", base_url, error);
};
}
return false;
}
//保存profile
saveProfile(profile) {
if (this.name == '') {return false;}
const botName = this.name;
try {
//保存profile
const profilePath = path.resolve('../tmp/', `profile_${botName}.json`);
fs.writeFileSync(profilePath, JSON.stringify(profile, null, 2));
}catch(error) {
console.error("Error got when save profile of %s, error detail:\n%s", botName, error);
return false;
}
return true;
}
//处理name为空的cookie
fixCookies(profile) {
let fixedProfile = profile;
if (typeof(profile.cookies) == 'undefined') {return profile;}
const botName = this.name;
for (const index in profile.cookies) {
if (profile.cookies[index].name == '') {
fixedProfile.cookies[index].name = botName;
}
}
return fixedProfile;
}
}
export default HeroBot;

12
bot/Kuaishou.mjs

@ -8,7 +8,10 @@ class Kuaishou extends HeroBot { @@ -8,7 +8,10 @@ class Kuaishou extends HeroBot {
let data = {};
try {
let options = {};
let options = {
userAgent: configs.userAgent,
viewport: configs.viewport
};
if (this.heroServer) {
options.connectionToCore = this.heroServer;
@ -25,13 +28,16 @@ class Kuaishou extends HeroBot { @@ -25,13 +28,16 @@ class Kuaishou extends HeroBot {
//等待所有内容加载完成
const tab = await hero.activeTab;
await tab.waitForLoad('AllContentLoaded', {timeoutMs: configs.heroTabOptions.timeoutMs});
await hero.waitForPaintingStable();
//解析网页HTML数据
data.title = await hero.document.title;
//data.url = await hero.url;
const elem = await hero.detach( hero.document.querySelector('.video-container-player') );
data.cover = elem.getAttribute('poster');
const elem = hero.document.querySelector('.video-container-player');
if (elem) {
data.cover = await elem.getAttribute('poster');
}
await hero.close();
}catch(error) {

18
bot/Xigua.mjs

@ -8,7 +8,10 @@ class Xigua extends HeroBot { @@ -8,7 +8,10 @@ class Xigua extends HeroBot {
let data = {};
try {
let options = {};
let options = {
userAgent: configs.userAgent,
viewport: configs.viewport
};
if (this.heroServer) {
options.connectionToCore = this.heroServer;
@ -25,18 +28,23 @@ class Xigua extends HeroBot { @@ -25,18 +28,23 @@ class Xigua extends HeroBot {
//等待所有内容加载完成
const tab = await hero.activeTab;
await tab.waitForLoad('AllContentLoaded', {timeoutMs: configs.heroTabOptions.timeoutMs});
await hero.waitForPaintingStable();
//解析网页HTML数据
const elems = await hero.detach( hero.document.querySelectorAll('meta') );
data.title = await hero.document.title;
//data.url = await hero.url;
const elems = await hero.document.querySelectorAll('meta');
let meta_name = '';
for (const elem of elems) {
meta_name = elem.getAttribute('name');
meta_name = await elem.getAttribute('name');
if (!meta_name) {continue;}
meta_name = meta_name.toLowerCase();
//console.log('meta', meta_name);
if (meta_name.indexOf('og:image') > -1) {
data.cover = elem.getAttribute('content');
data.cover = await elem.getAttribute('content');
}else if (meta_name.indexOf('og:title') > -1) {
data.title = elem.getAttribute('content');
data.title = await elem.getAttribute('content');
}
}

19
config.mjs

@ -1,10 +1,25 @@ @@ -1,10 +1,25 @@
export default {
userAgent: '~ chrome >= 114 && mac',
viewport: {
width: 1440,
height: 900
},
viewports: {
mob: {
width: 375,
height: 667
},
pc: {
width: 1440,
height: 900
},
},
//请求参数
heroBotOptions: {
timeoutMs: 10000,
referrer: '',
userAgent: 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/604.4.7 (KHTML, like Gecko) Mobile/15C202 MicroMessenger/6.6.1 NetType/4G Language/zh_CN'
},
//网页tab参数
@ -16,7 +31,7 @@ export default { @@ -16,7 +31,7 @@ export default {
userAgents: {
iphone_chrome: 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1',
iphone_wechat: 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/604.4.7 (KHTML, like Gecko) Mobile/15C202 MicroMessenger/6.6.1 NetType/4G Language/zh_CN',
mac_chrome: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
mac_chrome: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
android_wechat: 'Mozilla/5.0 (Linux; Android 7.1.1; OD103 Build/NMF26F; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.49 Mobile MQQBrowser/6.2 TBS/043632 Safari/537.36 MicroMessenger/6.6.1.1220(0x26060135) NetType/4G Language/zh_CN',
}

32
test/scrap_test.mjs

@ -17,8 +17,16 @@ import configs from '../config.mjs'; @@ -17,8 +17,16 @@ import configs from '../config.mjs';
case 'douyin':
//抖音测试
url = 'https://v.douyin.com/ieUpFCva/'; //mob and pc
configs.heroTabOptions.timeoutMs = 20000; //所有内容加载完成超时
configs.userAgent = configs.userAgents.mac_chrome;
configs.viewport = configs.viewports.pc;
console.log('Hero配置', configs);
const douyin = new Douyin(heroCloudServer);
url = 'https://v.douyin.com/iJr1NsJJ/';
console.log('请求中: %s ...', url);
data = await douyin.scrap(url);
console.log("解析结果:\n%s", JSON.stringify(data));
@ -28,15 +36,17 @@ import configs from '../config.mjs'; @@ -28,15 +36,17 @@ import configs from '../config.mjs';
case 'kuaishou':
//快手测试
url = 'https://www.kuaishou.com/f/X8FTguiIjZQVwE7'; //pc
//url = 'https://v.kuaishou.com/7zwqe6'; //mob
configs.heroTabOptions.timeoutMs = 20000; //所有内容加载完成超时
configs.heroBotOptions.userAgent = configs.userAgents.iphone_wechat;
//configs.heroBotOptions.userAgent = configs.userAgents.mac_chrome;
configs.userAgent = configs.userAgents.mac_chrome;
configs.viewport = configs.viewports.pc;
console.log('Hero配置', configs);
const kuaishou = new Kuaishou(heroCloudServer);
//url = 'https://www.kuaishou.com/f/X8FTguiIjZQVwE7';
url = 'https://v.kuaishou.com/7zwqe6';
//url = 'https://www.kuaishou.com/';
console.log('请求中: %s ...', url);
data = await kuaishou.scrap(url);
console.log("解析结果:\n%s", JSON.stringify(data));
@ -46,15 +56,19 @@ import configs from '../config.mjs'; @@ -46,15 +56,19 @@ import configs from '../config.mjs';
case 'xigua':
//西瓜测试
//url = 'https://www.ixigua.com/7092326495246516749'; //pc
url = 'https://v.ixigua.com/ieUaqrFN/'; //mobile
url = 'https://www.ixigua.com/7248225527335813636'; //pc
configs.heroBotOptions.referrer = url;
configs.heroBotOptions.userAgent = configs.userAgents.mac_chrome;
configs.userAgent = configs.userAgents.mac_chrome;
configs.viewport = configs.viewports.pc;
console.log('Hero配置', configs);
const xigua = new Xigua(heroCloudServer);
console.log('请求中: %s ...', url);
data = await xigua.scrap(`${url}?wid_try=1`);
//data = await xigua.scrap(`${url}?wid_try=1`);
data = await xigua.scrap(url);
console.log("解析结果:\n%s", JSON.stringify(data));
break;

Loading…
Cancel
Save