diff --git a/config.mjs b/config.mjs
index 3bb39b9..88c1318 100644
--- a/config.mjs
+++ b/config.mjs
@@ -4,7 +4,9 @@ let configs = {
     //自动任务相关配置
     task_list_dir: 'todo/', //待抓取任务文件保存目录
     data_save_dir: 'data/', //抓取完成数据保存目录,文件格式:.url快捷方式,详细说明见:https://filesite.io
+    task_log_dir: 'log/', //directory where the log of new tasks is saved, for tracking and analysing tasks
 
+    task_do_timeout: 180, //task handling timeout, in seconds
     max_fail_retry: 5, //任务失败最多重试次数
 
     //HeroUnion英雄联盟对接配置
diff --git a/lib/common.mjs b/lib/common.mjs
index be04798..dc5809a 100644
--- a/lib/common.mjs
+++ b/lib/common.mjs
@@ -1,6 +1,5 @@
 //公用方法
-import { rm as removeFile } from 'node:fs/promises';
-import { readdir, readFile } from 'node:fs/promises';
+import { readdir, readFile, rm as removeFile, appendFile } from 'node:fs/promises';
 import path from 'node:path';
 
 export default {
@@ -91,4 +90,22 @@ export default {
         return configs;
-    }
+    },
 
+    /**
+     * Append a log entry to the given file (appendFile creates the file when it is missing).
+     * Write errors are reported on stderr and never thrown.
+     *
+     * @param {string} filePath - Path of the log file.
+     * @param {string} content - Text to append.
+     * @returns {Promise<boolean>} true when the content was written, false otherwise.
+     */
+    async saveLog(filePath, content) {
+        try {
+            await appendFile(filePath, content);
+            return true;
+        } catch (err) {
+            console.error(`Log save to %s failed: %s`, filePath, err.message);
+            return false;
+        }
+    }
+
 };
diff --git a/lib/taskMoniter.mjs b/lib/taskMoniter.mjs
index b1f8bf9..330cd3a 100644
--- a/lib/taskMoniter.mjs
+++ b/lib/taskMoniter.mjs
@@ -89,6 +89,7 @@ class TaskMoniter {
         }
 
         this.tasks[task_id].status = this.statusCode.running;
+        this.tasks[task_id].updated = common.getTimestampInSeconds();
         this.taskStatus[this.statusCode.running] ++;
         this.taskStatus[this.statusCode.waiting] --;
 
@@ -100,9 +101,11 @@ class TaskMoniter {
             return false;
         }
 
+        //Decrement the counter of the previous status before overwriting it
         this.taskStatus[this.tasks[task_id].status] --;
         this.taskStatus[this.statusCode.waiting] ++;
         this.tasks[task_id].status = this.statusCode.waiting;
+        this.tasks[task_id].updated = common.getTimestampInSeconds();
 
         return true;
     }
@@ -113,6 +116,7 @@ class TaskMoniter {
         }
 
         this.tasks[task_id].status = this.statusCode.done;
+        this.tasks[task_id].updated = common.getTimestampInSeconds();
         this.taskStatus[this.statusCode.done] ++;
         this.taskStatus[this.statusCode.running] --;
 
@@ -131,6 +135,7 @@ class TaskMoniter {
         }
 
         this.tasks[task_id].status = this.statusCode.failed;
+        this.tasks[task_id].updated = common.getTimestampInSeconds();
         this.taskStatus[this.statusCode.failed] ++;
         this.taskStatus[this.statusCode.running] --;
 
@@ -143,6 +148,7 @@ class TaskMoniter {
         try {
             task.id = this.getTaskId(filename);
             task.status = this.statusCode.waiting;
+            task.created = common.getTimestampInSeconds();
             task.url = await readFile(filepath, { encoding: 'utf8' });
 
             if (task.url) {
diff --git a/log/Readme.md b/log/Readme.md
new file mode 100644
index 0000000..094d746
--- /dev/null
+++ b/log/Readme.md
@@ -0,0 +1,5 @@
+
+# log dir
+
+Save log of new task.
+
diff --git a/spider.mjs b/spider.mjs
index af1a77e..76b6c80 100644
--- a/spider.mjs
+++ b/spider.mjs
@@ -3,6 +3,7 @@
  * 负责监听任务目录里的新任务,并自动抓取数据保存到数据目录。
  * 增加失败任务的重试机制
  * 增加失败任务上报
+ * Add a timeout for task handling
  */
 import getConfigs from './config.mjs';
 import common from './lib/common.mjs';
@@ -16,6 +17,7 @@ import Xigua from './bot/Xigua.mjs';
 import Bilibili from './bot/Bilibili.mjs';
 
 import cron from 'node-cron';
+import path from 'node:path';
 
 (async () => {
     //设置configs为全局变量
@@ -45,14 +47,23 @@ import cron from 'node-cron';
     const heroCloudServer = typeof(configs.cloud_server) != 'undefined' && configs.cloud_server ? configs.cloud_server : '';
 
     //spider run
     let spider_is_running = false;
+    let last_run_time = 0; //start time of the latest task, in seconds
     const task_check_time = 20; //每 20 秒抓取一次
     const task_auto_run = cron.schedule(`*/${task_check_time} * * * * *`, async () => {
-        if (spider_is_running == true) {return false;} //避免同时执行多个爬虫任务
+        const current_time = common.getTimestampInSeconds();
+
+        //Avoid running several spider tasks at once, unless the previous one has exceeded the timeout
+        if (spider_is_running == true && current_time - last_run_time < configs.task_do_timeout) {
+            return false;
+        }
 
         const task = taskMoniter.getNewTask();
         if (!task) {return false;}
 
+        const logFile = path.resolve(configs.task_log_dir, `tasks_${heroUnionConfig.name}.log`);
+        await common.saveLog(logFile, JSON.stringify(task) + "\n");
+
         const botName = common.getBotName(task.url);
         console.log('New task %s handle by bot %s.', task.url, botName);
         let bot = null;
@@ -73,6 +84,7 @@ import cron from 'node-cron';
 
         if (bot) {
             spider_is_running = true;
+            last_run_time = common.getTimestampInSeconds();
             taskMoniter.setTaskRunning(task.id);
 
             const data = await bot.scrap(task.url);