From 87754b28a03779b73c00ec911ad03e074c5b0520 Mon Sep 17 00:00:00 2001 From: filesite Date: Fri, 12 Apr 2024 18:20:46 +0800 Subject: [PATCH] code reindent, add config for herounion --- README.md | 20 +++- config.mjs | 92 ++++++++------- lib/taskMoniter.mjs | 265 +++++++++++++++++++++---------------------- package.json | 20 ++-- spider.mjs | 58 +++++----- test/start_cloud.mjs | 18 +-- test/tajia_test.mjs | 16 +-- 7 files changed, 258 insertions(+), 231 deletions(-) diff --git a/README.md b/README.md index 516e337..0c99b6c 100644 --- a/README.md +++ b/README.md @@ -37,13 +37,14 @@ Hero scripts of machete. ## 使用方法 -1. 下载本源码到本地后,进入项目根目录; +1. 下载本源码到本地后,进入项目根目录 ``` git clone "https://git.filesite.io/filesite/machete_hero.git" cd machete_hero/ ``` -2. 执行下面命令安装依赖包: + +2. 执行下面命令安装依赖包 ``` npm install ``` @@ -51,8 +52,19 @@ npm install 如果你对npm和node不熟悉,请自行了解。 -3. 写一个.mjs脚本,调用bot/下的类库,实现目标网页访问和解析获取所需数据。 +3. 执行下面命令启动爬虫 +``` +npm start +``` + +在目录todo/里创建任务文件,爬虫检测到新任务后自动抓取数据并保存到data/目录下。 + + +4. 二次开发 + +写一个.mjs脚本,调用bot/下的类库,实现目标网页访问和解析获取所需数据。 + +还可以参考bot/下的类库,实现对任意网站的数据抓取。 bot/目录下的类库调用方法,可参考test/scrap_test.mjs测试脚本, 测试脚本使用方法见test/README.md文档。 - diff --git a/config.mjs b/config.mjs index bdf771e..9712212 100644 --- a/config.mjs +++ b/config.mjs @@ -1,45 +1,57 @@ export default { - //自动任务相关配置 - task_list_dir: 'todo/', //待抓取任务文件保存目录 - data_save_dir: 'data/', //抓取完成数据保存目录,文件格式:.url快捷方式,详细说明见:https://filesite.io - - - //bot相关配置 - //userAgent: '~ chrome >= 114 && mac', //指定操作系统和浏览器版本 - userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36', - viewport: { - width: 1440, - height: 900 - }, - - viewports: { - mob: { - width: 375, - height: 667 + //自动任务相关配置 + task_list_dir: 'todo/', //待抓取任务文件保存目录 + data_save_dir: 'data/', //抓取完成数据保存目录,文件格式:.url快捷方式,详细说明见:https://filesite.io + + + //herounion对接配置 + herounion: { + name: 'machete_hero', //爬虫名字 + description: '支持Machete的TaJian皮肤的hero爬虫', //爬虫简介 + platforms: 'douyin,kuaishou,xigua,bilibili', //爬虫支持的平台 + contracts: 'tajiantv', //爬虫支持的数据采集合约(可二次开发自定义) + country: 'cn', //爬虫所在国家 + lang: 'zh', //爬虫支持的语言 + contact: 'https://filesite.io', //爬虫的联系方式 }, - pc: { + + + //bot相关配置 + cloud_server: 'ws://192.168.3.13:1818', + userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36', + viewport: { width: 1440, height: 900 }, - }, - - //请求参数 - heroBotOptions: { - timeoutMs: 20000, - referrer: '', - }, - - //网页tab参数 - heroTabOptions: { - timeoutMs: 30000 - }, - - //常用浏览器user-agent - userAgents: { - iphone_chrome: 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1', - iphone_wechat: 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/604.4.7 (KHTML, like Gecko) Mobile/15C202 MicroMessenger/6.6.1 NetType/4G Language/zh_CN', - mac_chrome: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36', - android_wechat: 'Mozilla/5.0 (Linux; Android 7.1.1; OD103 Build/NMF26F; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.49 Mobile MQQBrowser/6.2 TBS/043632 Safari/537.36 MicroMessenger/6.6.1.1220(0x26060135) NetType/4G Language/zh_CN', - } - -}; + + viewports: { + mob: { + width: 375, + height: 667 + }, + pc: { + width: 1440, + height: 900 + }, + }, + + //请求参数 + heroBotOptions: { + timeoutMs: 20000, + referrer: '', + }, + + //网页tab参数 + heroTabOptions: { + timeoutMs: 30000 + }, + + //常用浏览器user-agent + userAgents: { + iphone_chrome: 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1', + iphone_wechat: 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/604.4.7 (KHTML, like Gecko) Mobile/15C202 MicroMessenger/6.6.1 NetType/4G Language/zh_CN', + mac_chrome: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36', + android_wechat: 'Mozilla/5.0 (Linux; Android 7.1.1; OD103 Build/NMF26F; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.49 Mobile MQQBrowser/6.2 TBS/043632 Safari/537.36 MicroMessenger/6.6.1.1220(0x26060135) NetType/4G Language/zh_CN', + } + +}; \ No newline at end of file diff --git a/lib/taskMoniter.mjs b/lib/taskMoniter.mjs index 29a94b5..786ed2f 100644 --- a/lib/taskMoniter.mjs +++ b/lib/taskMoniter.mjs @@ -16,175 +16,172 @@ import path from 'node:path'; import cron from 'node-cron'; class TaskMoniter { + constructor(task_list_dir) { + this.check_time_gap = 30; //检测间隔时间,单位:秒 + this.checking = false; - constructor(task_list_dir) { - this.check_time_gap = 30; //检测间隔时间,单位:秒 - this.checking = false; - - this.task_dir = task_list_dir; //监控目录:任务列表保存目录 - this.tasks = {}; //内存中的任务列表 - this.taskStatus = { //当前任务状态 - total: 0, //总任务数 - waiting: 0, //等待执行的任务数 - running: 0, //正在执行的任务数 - done: 0, //已完成的任务数 - failed: 0 //执行失败的任务数 - }; - - this.statusCode = { - waiting: 'waiting', - running: 'running', - done: 'done', - failed: 'failed', - }; - } - - getTaskFilePath(task_id) { - const dirPath = path.resolve(this.task_dir); - return `${dirPath}/${task_id}.task`; - } - - //注意:任务文件名不能重复,已经用过的文件名不能再使用 - //推荐以时间戳为任务文件名,如:1694762776985.task - getTaskId(filename) { - return filename.replace('.task', ''); - } - - getStatus() { - return this.taskStatus; - } - - getNewTask() { - let task = null; - - for (const id in this.tasks) { - if (this.tasks[id].status == this.statusCode.waiting) { - task = this.tasks[id]; - break; - } + this.task_dir = task_list_dir; //监控目录:任务列表保存目录 + this.tasks = {}; //内存中的任务列表 + this.taskStatus = { //当前任务状态 + total: 0, //总任务数 + waiting: 0, //等待执行的任务数 + running: 0, //正在执行的任务数 + done: 0, //已完成的任务数 + failed: 0 //执行失败的任务数 + }; + + this.statusCode = { + waiting: 'waiting', + running: 'running', + done: 'done', + failed: 'failed', + }; } - return task; - } - - setTaskRunning(task_id) { - if (typeof(this.tasks[task_id]) == 'undefined') { - return false; + getTaskFilePath(task_id) { + const dirPath = path.resolve(this.task_dir); + return `${dirPath}/${task_id}.task`; } - this.tasks[task_id].status = this.statusCode.running; - this.taskStatus[this.statusCode.running] ++; - this.taskStatus[this.statusCode.waiting] --; - - return true; - } + //注意:任务文件名不能重复,已经用过的文件名不能再使用 + //推荐以时间戳为任务文件名,如:1694762776985.task + getTaskId(filename) { + return filename.replace('.task', ''); + } - setTaskDone(task_id) { - if (typeof(this.tasks[task_id]) == 'undefined') { - return false; + getStatus() { + return this.taskStatus; } - this.tasks[task_id].status = this.statusCode.done; - this.taskStatus[this.statusCode.done] ++; - this.taskStatus[this.statusCode.running] --; + getNewTask() { + let task = null; - const filepath = this.getTaskFilePath(task_id); - common.removeFile(filepath); //async delete - return true; - } + for (const id in this.tasks) { + if (this.tasks[id].status == this.statusCode.waiting) { + task = this.tasks[id]; + break; + } + } - setTaskFailed(task_id) { - if (typeof(this.tasks[task_id]) == 'undefined') { - return false; + return task; } - this.tasks[task_id].status = this.statusCode.failed; - this.taskStatus[this.statusCode.failed] ++; - this.taskStatus[this.statusCode.running] --; - - return true; - } + setTaskRunning(task_id) { + if (typeof(this.tasks[task_id]) == 'undefined') { + return false; + } - async parseTaskFile(filename, filepath) { - let task = {}; + this.tasks[task_id].status = this.statusCode.running; + this.taskStatus[this.statusCode.running] ++; + this.taskStatus[this.statusCode.waiting] --; - try { - task.id = this.getTaskId(filename); - task.status = this.statusCode.waiting; + return true; + } - task.url = await readFile(filepath, { encoding: 'utf8' }); - if (task.url) { - task.url = task.url.replace(/[\r\n]/g, ''); + setTaskDone(task_id) { + if (typeof(this.tasks[task_id]) == 'undefined') { + return false; } - }catch(error) { - console.error('Get task file content failed: %s', error); - } - return task; - } + this.tasks[task_id].status = this.statusCode.done; + this.taskStatus[this.statusCode.done] ++; + this.taskStatus[this.statusCode.running] --; - addTask(task) { - if (typeof(this.tasks[task.id]) != 'undefined') { - return false; + const filepath = this.getTaskFilePath(task_id); + common.removeFile(filepath); //async delete + return true; } - this.tasks[task.id] = task; - this.taskStatus[task.status] ++; - this.taskStatus.total ++; + setTaskFailed(task_id) { + if (typeof(this.tasks[task_id]) == 'undefined') { + return false; + } - return true; - } + this.tasks[task_id].status = this.statusCode.failed; + this.taskStatus[this.statusCode.failed] ++; + this.taskStatus[this.statusCode.running] --; - async checkTasks() { - if (this.checking == true) { - return; + return true; } - try { - console.log('[%s] TaskMoniter auto check...', common.getTimeString()); + async parseTaskFile(filename, filepath) { + let task = {}; - this.checking = true; + try { + task.id = this.getTaskId(filename); + task.status = this.statusCode.waiting; - const dirPath = path.resolve(this.task_dir); - const files = await readdir(dirPath); - let task = null, task_id = null; - for (const filename of files) { - if (filename.indexOf('.task') === -1) {continue;} //ignore not *.task files - - task_id = this.getTaskId(filename); - if (typeof(this.tasks[task_id]) != 'undefined') { //跳过已经存在的任务 - continue; + task.url = await readFile(filepath, { encoding: 'utf8' }); + if (task.url) { + task.url = task.url.replace(/[\r\n]/g, ''); } + }catch(error) { + console.error('Get task file content failed: %s', error); + } + + return task; + } - task = await this.parseTaskFile(filename, `${dirPath}/${filename}`); - this.addTask(task); + addTask(task) { + if (typeof(this.tasks[task.id]) != 'undefined') { + return false; } - this.checking = false; - }catch(error) { - this.checking = false; - console.error('Check tasks failed: %s', error); + this.tasks[task.id] = task; + this.taskStatus[task.status] ++; + this.taskStatus.total ++; + + return true; } - } - run() { //开始监控任务目录,把所有任务缓存到内存 - console.log('[%s] TaskMoniter started.', common.getTimeString()); + async checkTasks() { + if (this.checking == true) { + return; + } + + try { + console.log('[%s] TaskMoniter auto check...', common.getTimeString()); + + this.checking = true; - //auto check new tasks - const _self = this; - const task_check_time = this.check_time_gap; - const task_auto_run = cron.schedule(`*/${task_check_time} * * * * *`, async () => { - await _self.checkTasks(); - console.log('Status', _self.getStatus()); - }, { - scheduled: false - }); + const dirPath = path.resolve(this.task_dir); + const files = await readdir(dirPath); + let task = null, task_id = null; + for (const filename of files) { + if (filename.indexOf('.task') === -1) {continue;} //ignore not *.task files - task_auto_run.start(); - console.log('[%s] TaskMoniter auto check started.', common.getTimeString()); + task_id = this.getTaskId(filename); + if (typeof(this.tasks[task_id]) != 'undefined') { //跳过已经存在的任务 + continue; + } - } + task = await this.parseTaskFile(filename, `${dirPath}/${filename}`); + this.addTask(task); + } + this.checking = false; + }catch(error) { + this.checking = false; + console.error('Check tasks failed: %s', error); + } + } + + run() { //开始监控任务目录,把所有任务缓存到内存 + console.log('[%s] TaskMoniter started.', common.getTimeString()); + + //auto check new tasks + const _self = this; + const task_check_time = this.check_time_gap; + const task_auto_run = cron.schedule(`*/${task_check_time} * * * * *`, async () => { + await _self.checkTasks(); + console.log('Status', _self.getStatus()); + }, { + scheduled: false + }); + + task_auto_run.start(); + console.log('[%s] TaskMoniter auto check started.', common.getTimeString()); + } } -export default TaskMoniter; +export default TaskMoniter; \ No newline at end of file diff --git a/package.json b/package.json index 4fb0129..a23aebd 100644 --- a/package.json +++ b/package.json @@ -1,9 +1,13 @@ { - "name": "machege-hero", - "type": "module", - "dependencies": { - "@ulixee/cloud": "^2.0.0-alpha.24", - "@ulixee/hero": "^2.0.0-alpha.24", - "node-cron": "^3.0.2" - } -} + "name": "machege-hero", + "type": "module", + "dependencies": { + "@ulixee/cloud": "^2.0.0-alpha.24", + "@ulixee/hero": "^2.0.0-alpha.24", + "node-cron": "^3.0.2", + "axios": "^1.3.3" + }, + "scripts": { + "start": "node spider.mjs" + } +} \ No newline at end of file diff --git a/spider.mjs b/spider.mjs index 5c476f5..f9f1f6e 100644 --- a/spider.mjs +++ b/spider.mjs @@ -21,8 +21,8 @@ import cron from 'node-cron'; taskMoniter.run(); //监控新任务 - - const heroCloudServer = 'ws://192.168.3.13:1818'; + //配置本地cloud server地址,cloud安装参考:./install_cloud.sh + const heroCloudServer = typeof(configs.cloud_server) != 'undefined' && configs.cloud_server ? configs.cloud_server : ''; //spider run let spider_is_running = false; @@ -37,18 +37,18 @@ import cron from 'node-cron'; console.log('New task %s handle by bot %s.', task.url, botName); let bot = null; switch (botName) { - case 'douyin': - bot = new Douyin(heroCloudServer); - break; - case 'kuaishou': - bot = new Kuaishou(heroCloudServer); - break; - case 'xigua': - bot = new Xigua(heroCloudServer); - break; - case 'bilibili': - bot = new Bilibili(heroCloudServer); - break; + case 'douyin': + bot = new Douyin(heroCloudServer); + break; + case 'kuaishou': + bot = new Kuaishou(heroCloudServer); + break; + case 'xigua': + bot = new Xigua(heroCloudServer); + break; + case 'bilibili': + bot = new Bilibili(heroCloudServer); + break; } if (bot) { @@ -62,29 +62,31 @@ import cron from 'node-cron'; if ( await tajian.saveUrlShortcut(task.id, data) && await tajian.saveDescriptionFiles(task.id, data) - ) { + ) { taskMoniter.setTaskDone(task.id); - }else { - taskMoniter.setTaskFailed(task.id); - } }else { taskMoniter.setTaskFailed(task.id); } - - spider_is_running = false; }else { - console.error('No bot matched with url %s', task.url); - taskMoniter.setTaskRunning(task.id); taskMoniter.setTaskFailed(task.id); } - }, { - scheduled: false - }); + + spider_is_running = false; + }else { + console.error('No bot matched with url %s', task.url); + taskMoniter.setTaskRunning(task.id); + taskMoniter.setTaskFailed(task.id); + } +}, { + scheduled: false +}); task_auto_run.start(); console.log('[%s] Spider started.', common.getTimeString()); + //TODO: 对接英雄联盟接口:https://herounion.filesite.io + })().catch(error => { - console.error("Spider error got:\n%s", error); - process.exit(1); -}); + console.error("Spider error got:\n%s", error); + process.exit(1); +}); \ No newline at end of file diff --git a/test/start_cloud.mjs b/test/start_cloud.mjs index 5f1993f..70d6265 100644 --- a/test/start_cloud.mjs +++ b/test/start_cloud.mjs @@ -2,13 +2,13 @@ import { CloudNode } from '@ulixee/cloud'; (async () => { - const cloudNode = new CloudNode({ - port: 1818, - cloudType: 'private' - }); - await cloudNode.listen(); - console.log(`CloudNode started on port ${await cloudNode.port}`); + const cloudNode = new CloudNode({ + port: 1818, + cloudType: 'private' + }); + await cloudNode.listen(); + console.log(`CloudNode started on port ${await cloudNode.port}`); })().catch(error => { - console.log('ERROR starting Ulixee CloudNode', error); - process.exit(1); -}); + console.log('ERROR starting Ulixee CloudNode', error); + process.exit(1); +}); \ No newline at end of file diff --git a/test/tajia_test.mjs b/test/tajia_test.mjs index f0045d4..354fb5b 100644 --- a/test/tajia_test.mjs +++ b/test/tajia_test.mjs @@ -3,11 +3,11 @@ import TaJian from '../lib/tajian.mjs'; (async () => { const data = { - url: 'https://v.douyin.com/i8sEyb6/', - done: true, - bot: 'douyin', - title: '自由与成功 - 抖音', - cover: '//p6-pc-sign.douyinpic.com/image-cut-tos-priv/d1b1e96513a755b2d6ff4cf8d8260f9b~tplv-dy-resize-origshort-autoq-75:330.jpeg?biz_tag=pcweb_cover&from=3213915784&s=PackSourceEnum_AWEME_DETAIL&sc=cover&se=false&x-expires=2010128400&x-signature=VuJiezXPv7y13fu63Krn9tIbLvQ%3D' + url: 'https://v.douyin.com/i8sEyb6/', + done: true, + bot: 'douyin', + title: '自由与成功 - 抖音', + cover: '//p6-pc-sign.douyinpic.com/image-cut-tos-priv/d1b1e96513a755b2d6ff4cf8d8260f9b~tplv-dy-resize-origshort-autoq-75:330.jpeg?biz_tag=pcweb_cover&from=3213915784&s=PackSourceEnum_AWEME_DETAIL&sc=cover&se=false&x-expires=2010128400&x-signature=VuJiezXPv7y13fu63Krn9tIbLvQ%3D' }; const filename = 'douyintest'; @@ -20,6 +20,6 @@ import TaJian from '../lib/tajian.mjs'; console.log('descriptions save done', saveDescDone); })().catch(error => { - console.error("Error got:\n%s", error); - process.exit(1); -}); + console.error("Error got:\n%s", error); + process.exit(1); +}); \ No newline at end of file