diff --git a/.gitignore b/.gitignore index b939fe3..bf2b2ed 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,8 @@ package-lock.json node_modules/ tmp/profile*.json +todo/*.todo +data/*.url +data/*.jpg +data/*.png +data/*.txt diff --git a/bot/Bilibili.mjs b/bot/Bilibili.mjs index 4a71bb2..780998c 100644 --- a/bot/Bilibili.mjs +++ b/bot/Bilibili.mjs @@ -59,6 +59,7 @@ class Bilibili extends HeroBot { await hero.close(); + data.bot = this.name; data.done = true; }catch(error) { console.error("Error got when request %s via hero: %s", url, error); diff --git a/bot/Douyin.mjs b/bot/Douyin.mjs index 76219dc..82e4849 100644 --- a/bot/Douyin.mjs +++ b/bot/Douyin.mjs @@ -19,6 +19,7 @@ class Douyin extends HeroBot { } const profile = await this.init('douyin'); + data.bot = this.name; if (profile) { options.userProfile = profile; } diff --git a/bot/Kuaishou.mjs b/bot/Kuaishou.mjs index 43c4769..36aec49 100644 --- a/bot/Kuaishou.mjs +++ b/bot/Kuaishou.mjs @@ -43,6 +43,7 @@ class Kuaishou extends HeroBot { await hero.close(); + data.bot = this.name; data.done = true; }catch(error) { console.error("Error got when request %s via hero: %s", url, error); diff --git a/bot/Xigua.mjs b/bot/Xigua.mjs index 430a6e6..19e3500 100644 --- a/bot/Xigua.mjs +++ b/bot/Xigua.mjs @@ -52,6 +52,7 @@ class Xigua extends HeroBot { await hero.close(); + data.bot = this.name; data.done = true; }catch(error) { console.error("Error got when request %s via hero: %s", url, error); diff --git a/config.mjs b/config.mjs index 7da4b4b..dd69a9f 100644 --- a/config.mjs +++ b/config.mjs @@ -1,4 +1,10 @@ export default { + //自动任务相关配置 + task_list_dir: 'todo/', //待抓取任务文件保存目录 + task_save_dir: 'data/', //抓取完成数据保存目录,文件格式:.url快捷方式,详细说明见:https://filesite.io + + + //bot相关配置 userAgent: '~ chrome >= 114 && mac', viewport: { width: 1440, diff --git a/data/README.md b/data/README.md new file mode 100644 index 0000000..43d1281 --- /dev/null +++ b/data/README.md @@ -0,0 +1,32 @@ + +## 视频网页抓取数据保存目录 + 
+按照FileSite.io的.url快捷方式文件格式保存, +并把相关数据以.txt描述文件保存。 + +示例如下: + +文件名:20230913001.url + +文件内容: +``` +[InternetShortcut] +URL=https://v.ixigua.com/ieUaqrFN/ +``` + +标题描述文件20230913001_title.txt: +``` +漂亮妻子留守农村,好心丈夫托人过来帮忙,不料竟引发悲剧,影视 - 西瓜视频 +``` + +封面图描述文件20230913001_cover.txt: +``` +http://p26-sign.bdxiguaimg.com/tos-cn-i-0004/ogB8EBP9dzAj3PApA2fDAIyACAbQuBpSIBN8Wh~tplv-pk90l89vgd-crop-center:864:486.jpeg?appId=1768&channelId=0&customType=custom%2Fnone&from=704_large_image_list&imageType=video1609&isImmersiveScene=0&is_stream=0&logId=202309132134234286F774B5273B4C0A5F&requestFrom=704&x-expires=1726148064&x-signature=ReDy6AL8DMvD7YsUrl%2F%2Bl2wb6Ls%3D +``` + +考虑到图片网址可能会有时效性,自动抓取程序将把它下载下来保存为:20230913001.jpg, +则封面图描述文件20230913001_cover.txt内容为: +``` +./20230913001.jpg +``` + diff --git a/todo/README.md b/todo/README.md new file mode 100644 index 0000000..0f1426b --- /dev/null +++ b/todo/README.md @@ -0,0 +1,14 @@ + +## 待抓取网页任务目录 + +每个.todo文件视为一个待处理的任务,文件内容为待抓取的视频网址。 + +自动处理程序在完成任务后,将删除任务文件,并将抓取数据结果保存到config.mjs里的数据保存目录中。 + +任务文件内容示例: +``` +https://v.ixigua.com/ieUaqrFN/ +``` + +文件名可以用时间戳命名。 +