/**
 * Scraper for Douyin share pages, driven by a Ulixee Hero browser session.
 *
 * Relies on `Hero` (the default export of `@ulixee/hero`) being imported at
 * module level in the file that contains this class.
 */
class Douyin {
    /**
     * @param {string} [heroCloudServer] Address handed to Hero as
     *   `connectionToCore` (the tests in this repo pass a `ws://host:port`
     *   string). When omitted or falsy, Hero is created without that option.
     */
    constructor(heroCloudServer) {
        this.heroServer = heroCloudServer ? heroCloudServer : '';
    }

    /**
     * Loads `url` in a Hero session and reads the video cover and title from
     * the page's `<meta name="...">` tags.
     *
     * This method never rejects: any failure is logged with `console.error`
     * and whatever was collected so far (possibly an empty object) is returned.
     *
     * @param {string} url Page address to load.
     * @returns {Promise<{cover?: string, title?: string}>} `cover` comes from the
     *   meta tag whose name contains `video_cover_image_url`, `title` from the
     *   one whose name contains `video_title` (names compared case-insensitively).
     */
    async scrap(url) {
        const data = {};
        // Declared outside the try block so the finally clause can release it.
        let hero = null;

        try {
            const options = {};
            if (this.heroServer) {
                options.connectionToCore = this.heroServer;
            }

            hero = new Hero(options);
            await hero.goto(url, {
                timeoutMs: 10000,
                referrer: 'https://wechat.com',
                userAgent: 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1'
            });

            // Wait until all page content has finished loading.
            const tab = await hero.activeTab;
            await tab.waitForLoad('AllContentLoaded', {timeoutMs: 5000});

            const elems = await hero.detach(hero.document.querySelectorAll('meta'));
            for (const elem of elems) {
                const rawName = elem.getAttribute('name');
                if (!rawName) {
                    continue;
                }

                const metaName = rawName.toLowerCase();
                if (metaName.includes('video_cover_image_url')) {
                    data.cover = elem.getAttribute('content');
                } else if (metaName.includes('video_title')) {
                    data.title = elem.getAttribute('content');
                }
            }
        } catch (error) {
            console.error('ERROR when request url via hero', url, error);
        } finally {
            // Always release the session: previously a failed goto/waitForLoad
            // skipped close() and left the Hero session open.
            if (hero) {
                try {
                    await hero.close();
                } catch (closeError) {
                    console.error('ERROR when closing hero session', url, closeError);
                }
            }
        }

        return data;
    }

}

export default Douyin;
// ---- test/douyin_test.mjs ----
// Manual smoke test: opens a Douyin share link through a remote Hero core and
// prints the cover/title meta tags. Relies on the module-level
// `import Hero from '@ulixee/hero';` of that file.
const runDouyinTest = async () => {
    const browser = new Hero({ connectionToCore: 'ws://192.168.3.13:1818' });

    const shareUrl = 'https://v.douyin.com/iJr1NsJJ/';
    console.log('请求 %s 中。。。', shareUrl);

    const gotoOptions = {
        timeoutMs: 10000,
        referrer: 'https://wechat.com',
        userAgent: 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1'
    };
    await browser.goto(shareUrl, gotoOptions);

    // Wait until all page content has finished loading.
    const activeTab = await browser.activeTab;
    await activeTab.waitForLoad('AllContentLoaded', { timeoutMs: 5000 });

    // Read the three load flags one after another, in the original order.
    const paintingStable = await browser.isPaintingStable;
    const domContentLoaded = await browser.isDomContentLoaded;
    const allContentLoaded = await browser.isAllContentLoaded;
    console.log('加载完成', paintingStable, domContentLoaded, allContentLoaded);

    const metaElems = await browser.detach(browser.document.querySelectorAll('meta'));
    console.log('数量', metaElems.length);

    const wantedKeys = ['video_cover_image_url', 'video_title'];
    for (const metaElem of metaElems) {
        const rawName = metaElem.getAttribute('name');
        if (rawName) {
            const lowered = rawName.toLowerCase();
            if (wantedKeys.some((key) => lowered.includes(key))) {
                console.log('meta name %s, content: %s', lowered, metaElem.getAttribute('content'));
            }
        }
    }

    await browser.close();
};

runDouyinTest().catch((failure) => {
    console.log('ERROR when request url via hero', failure);
    process.exit(1);
});

// ---- test/scrap_test.mjs ----
// Manual smoke test for the Douyin scraper class. Relies on the module-level
// `import Douyin from '../bot/Douyin.mjs';` of that file.
const runScrapTest = async () => {
    const scraper = new Douyin('ws://192.168.3.13:1818');
    const shareUrl = 'https://v.douyin.com/iJr1NsJJ/';

    console.log('请求中: %s ...', shareUrl);
    const result = await scraper.scrap(shareUrl);
    console.log('抖音网址 %s 解析结果: %s', shareUrl, JSON.stringify(result));
};

runScrapTest();