From 70bf6ebf25610a6ed95fc7fe4d3ea43f05ec6c2c Mon Sep 17 00:00:00 2001 From: filesite Date: Mon, 20 May 2024 23:40:15 +0800 Subject: [PATCH] douyin livestream support --- bot/Douyin.mjs | 30 +++++++++++++++++++++++------- config.mjs | 13 ++++++------- lib/common.mjs | 5 +++++ spider.mjs | 6 +++++- test/cloud_test.mjs | 27 ++++++++++++++------------- test/douyin_test.mjs | 44 ++++++++++++++++++++++++++++++-------------- test/scrap_test.mjs | 10 ++++++---- 7 files changed, 89 insertions(+), 46 deletions(-) diff --git a/bot/Douyin.mjs b/bot/Douyin.mjs index be231c3..eadf087 100644 --- a/bot/Douyin.mjs +++ b/bot/Douyin.mjs @@ -59,22 +59,38 @@ class Douyin extends HeroBot { }, {timeoutMs: configs.heroTabOptions.timeoutMs}); } + let rnd_secods = 10 + parseInt(Math.random() * 10); + console.log("Sleep %s seconds...", rnd_secods); + await common.delay(rnd_secods); //解析网页HTML数据 data.title = await hero.document.title; if (this.ua == 'mob') { //手机版网页解析 - const imgElem = await hero.querySelector('.video-container img.poster'); + let imgElem = await hero.querySelector('.video-container img.poster'); + let elType = 'image'; if (!imgElem) { - console.error('HTML解析出错,找不到封面图', data); - await hero.close(); - //删除profile文件后重试 - await this.deleteProfile(); - return false; + //尝试去抓取video的poster属性 + imgElem = await hero.querySelector('.xgplayer video'); + elType = 'video'; + + if (!imgElem) { + //尝试获取用户头像作为封面图,兼容直播页面 + imgElem = await hero.querySelector('.avatar-component-avatar-container img'); + elType = 'image'; + + if (!imgElem) { + console.error('HTML解析出错,找不到封面图', data); + await hero.close(); + //删除profile文件后重试 + await this.deleteProfile(); + return false; + } + } } - data.cover = await imgElem.src; + data.cover = elType == 'image' ? await imgElem.src : await imgElem.poster; }else { //pc版网页解析 const elems = await hero.document.querySelectorAll('meta'); diff --git a/config.mjs b/config.mjs index e7927c0..24daa2e 100644 --- a/config.mjs +++ b/config.mjs @@ -38,10 +38,10 @@ let configs = { //可选项参考官方文档:https://ulixee.org/docs/hero/basic-client/hero botOptions: { - showChrome: false, - showChromeInteractions: false, - showDevtools: false, - showChromeAlive: false, + showChrome: true, + showChromeInteractions: true, + showDevtools: true, + showChromeAlive: true, }, viewports: { @@ -76,10 +76,9 @@ let configs = { }; -async function getConfig() { +async function getConfig(configFile) { //自定义JSON格式配置文件支持 - if (process.argv.length >= 3) { - let configFile = process.argv[2]; + if (typeof(configFile) != 'undefined' && configFile) { let myConfigs = await common.loadCustomizeConfig(configFile); if (myConfigs) { configs = common.mergeConfigs(myConfigs, configs); diff --git a/lib/common.mjs b/lib/common.mjs index 4d81f57..7e75101 100644 --- a/lib/common.mjs +++ b/lib/common.mjs @@ -1,6 +1,7 @@ //公用方法 import { readdir, readFile, rm as removeFile, appendFile } from 'node:fs/promises'; import path from 'node:path'; +import { setTimeout } from 'node:timers/promises'; export default { @@ -108,6 +109,10 @@ export default { } return saved; + }, + + delay: async function(seconds) { + await setTimeout(seconds * 1000); } }; diff --git a/spider.mjs b/spider.mjs index db9e592..13b594b 100644 --- a/spider.mjs +++ b/spider.mjs @@ -22,7 +22,11 @@ import path from 'node:path'; (async () => { //设置configs为全局变量 - global.configs = await getConfigs(); + let configFile = ''; + if (process.argv.length >= 3) { + configFile = process.argv[2]; + } + global.configs = await getConfigs(configFile); const taskMoniter = new TaskMoniter(configs.task_list_dir); const tajian = new TaJian(configs.data_save_dir); diff --git a/test/cloud_test.mjs b/test/cloud_test.mjs index 4e1761e..a4bba21 100644 --- a/test/cloud_test.mjs +++ b/test/cloud_test.mjs @@ -1,21 +1,22 @@ import Hero from '@ulixee/hero'; (async () => { - const hero = new Hero({ connectionToCore: 'ws://127.0.0.1:1818' }); + const hero = new Hero({ connectionToCore: 'ws://127.0.0.1:1818' }); - //const url = 'https://filesite.io'; - //const url = 'https://www.google.com'; - const url = 'https://v.douyin.com/iJr1NsJJ/'; - await hero.goto(url, { - timeoutMs: 20000, - referrer: '-' - }); + //const url = 'https://filesite.io'; + //const url = 'https://www.google.com'; + let url = 'https://v.douyin.com/iJr1NsJJ/'; - const title = await hero.document.title; - console.log("Page title:\n", title); + await hero.goto(url, { + timeoutMs: 20000, + referrer: '-' + }); - await hero.close(); + const title = await hero.document.title; + console.log("Page title:\n", title); + + await hero.close(); })().catch(error => { - console.error("Error got:\n%s", error); - process.exit(1); + console.error("Error got:\n%s", error); + process.exit(1); }); diff --git a/test/douyin_test.mjs b/test/douyin_test.mjs index b7c1a58..cfac947 100644 --- a/test/douyin_test.mjs +++ b/test/douyin_test.mjs @@ -1,6 +1,8 @@ import Hero from '@ulixee/hero'; (async () => { + const delay = ms => new Promise(resolve => setTimeout(resolve, ms)); + const hero = new Hero({ connectionToCore: 'ws://127.0.0.1:1818', @@ -24,7 +26,9 @@ import Hero from '@ulixee/hero'; showChromeAlive: true, }); - const url = 'https://v.douyin.com/i2PBaR5B/'; + let url = 'https://v.douyin.com/i2PBaR5B/'; + //直播地址测试 + url = 'https://v.douyin.com/i2WaMoAN/'; console.log("请求 %s 中。。。", url); await hero.goto(url, { timeoutMs: 60000, @@ -42,15 +46,17 @@ import Hero from '@ulixee/hero'; //await tab.waitForLoad('AllContentLoaded', {timeoutMs: 30000}); await tab.waitForLoad('DomContentLoaded', {timeoutMs: 30000}); - //await hero.waitForState({ - // all(assert) { - // assert( - // hero.detach( hero.document.querySelectorAll('img.poster') ), - // els => els && els.length > 0, - // ); - // } - // }, {timeoutMs: 20000}); - //console.log('poster封面图标签已经准备好'); + /* + await hero.waitForState({ + all(assert) { + assert( + hero.detach( hero.document.querySelectorAll('img.poster') ), + els => els && els.length > 0, + ); + } + }, {timeoutMs: 20000}); + console.log('poster封面图标签已经准备好'); + */ console.log('加载完成', await hero.isPaintingStable, await hero.isDomContentLoaded, await hero.isAllContentLoaded); @@ -59,16 +65,26 @@ import Hero from '@ulixee/hero'; let doc_url = await hero.document.location.href; console.log('网址', doc_url); + let rnd_secods = parseInt(Math.random() * 20); + console.log("Sleep %s seconds...", rnd_secods); + await delay(1000*rnd_secods); //sleep + + //let doc_html = await hero.document.body.innerHTML; //console.log('网页内容', doc_html); let title = await hero.document.title; console.log('网页标题', title); - const elem = await hero.querySelector('.video-container img.poster'); + let elem = await hero.querySelector('.video-container img.poster'); + elem = await hero.querySelector('.xgplayer video'); + let imgUrl = ''; - imgUrl = await elem.src; - console.log('post image url: %s', imgUrl); + //imgUrl = await elem.src; + if (elem) { + imgUrl = await elem.poster; + console.log('post image url: %s', imgUrl); + } //const elems = await hero.detach( hero.document.querySelectorAll('meta') ); //const elems = await hero.document.querySelectorAll('meta'); @@ -85,7 +101,7 @@ import Hero from '@ulixee/hero'; // } //} - await hero.close(); + await hero.close(); })().catch(error => { console.error("Error got:\n%s", error); process.exit(1); diff --git a/test/scrap_test.mjs b/test/scrap_test.mjs index 20e851a..bfbe5e6 100644 --- a/test/scrap_test.mjs +++ b/test/scrap_test.mjs @@ -24,17 +24,19 @@ import getConfigs from '../config.mjs'; //抖音测试 url = 'https://v.douyin.com/i2PBaR5B/'; //mob and pc - configs.heroTabOptions.timeoutMs = 20000; //所有内容加载完成超时 + //直播地址测试 + url = 'https://v.douyin.com/i2WaMoAN/'; - configs.userAgent = configs.userAgents.mac_chrome; - configs.viewport = configs.viewports.pc; + configs.heroTabOptions.timeoutMs = 20000; //所有内容加载完成超时 - console.log('Hero配置', configs); + //configs.userAgent = configs.userAgents.mac_chrome; + //configs.viewport = configs.viewports.pc; const douyin = new Douyin(heroCloudServer); //使用手机模式,默认为pc douyin.setMode('mob'); + console.log('Hero配置', configs); console.log('请求中: %s ...', url); data = await douyin.scrap(url);