Browse Source

add task log, max run time

master
filesite 7 months ago
parent
commit
8a01d354c3
  1. 2
      config.mjs
  2. 19
      lib/common.mjs
  3. 7
      lib/taskMoniter.mjs
  4. 5
      log/Readme.md
  5. 16
      spider.mjs

2
config.mjs

@ -4,7 +4,9 @@ let configs = {
//自动任务相关配置 //自动任务相关配置
task_list_dir: 'todo/', //待抓取任务文件保存目录 task_list_dir: 'todo/', //待抓取任务文件保存目录
data_save_dir: 'data/', //抓取完成数据保存目录,文件格式:.url快捷方式,详细说明见:https://filesite.io data_save_dir: 'data/', //抓取完成数据保存目录,文件格式:.url快捷方式,详细说明见:https://filesite.io
task_log_dir: 'log/', //新任务日志保存目录,方便跟踪和分析任务
task_do_timeout: 180, //任务处理超时时长,单位:秒
max_fail_retry: 5, //任务失败最多重试次数 max_fail_retry: 5, //任务失败最多重试次数
//HeroUnion英雄联盟对接配置 //HeroUnion英雄联盟对接配置

19
lib/common.mjs

@ -1,6 +1,5 @@
//公用方法 //公用方法
import { rm as removeFile } from 'node:fs/promises'; import { readdir, readFile, rm as removeFile, appendFile } from 'node:fs/promises';
import { readdir, readFile } from 'node:fs/promises';
import path from 'node:path'; import path from 'node:path';
export default { export default {
@ -91,4 +90,20 @@ export default {
return configs; return configs;
} }
//保存log到指定文件
async saveLog(filePath, content) {
let saved = false;
try {
let saveRes = await appendFile(filePath, content);
if (saveRes == undefined) {
saved = true;
}
} catch (err) {
console.error(`Log save to %s failed: %s`, filePath, err.message);
}
return saved;
}
}; };

7
lib/taskMoniter.mjs

@ -89,6 +89,7 @@ class TaskMoniter {
} }
this.tasks[task_id].status = this.statusCode.running; this.tasks[task_id].status = this.statusCode.running;
this.tasks[task_id].updated = common.getTimestampInSeconds();
this.taskStatus[this.statusCode.running] ++; this.taskStatus[this.statusCode.running] ++;
this.taskStatus[this.statusCode.waiting] --; this.taskStatus[this.statusCode.waiting] --;
@ -100,9 +101,10 @@ class TaskMoniter {
return false; return false;
} }
this.tasks[task_id].status = this.statusCode.waiting;
this.tasks[task_id].updated = common.getTimestampInSeconds();
this.taskStatus[this.tasks[task_id].status] --; this.taskStatus[this.tasks[task_id].status] --;
this.taskStatus[this.statusCode.waiting] ++; this.taskStatus[this.statusCode.waiting] ++;
this.tasks[task_id].status = this.statusCode.waiting;
return true; return true;
} }
@ -113,6 +115,7 @@ class TaskMoniter {
} }
this.tasks[task_id].status = this.statusCode.done; this.tasks[task_id].status = this.statusCode.done;
this.tasks[task_id].updated = common.getTimestampInSeconds();
this.taskStatus[this.statusCode.done] ++; this.taskStatus[this.statusCode.done] ++;
this.taskStatus[this.statusCode.running] --; this.taskStatus[this.statusCode.running] --;
@ -131,6 +134,7 @@ class TaskMoniter {
} }
this.tasks[task_id].status = this.statusCode.failed; this.tasks[task_id].status = this.statusCode.failed;
this.tasks[task_id].updated = common.getTimestampInSeconds();
this.taskStatus[this.statusCode.failed] ++; this.taskStatus[this.statusCode.failed] ++;
this.taskStatus[this.statusCode.running] --; this.taskStatus[this.statusCode.running] --;
@ -143,6 +147,7 @@ class TaskMoniter {
try { try {
task.id = this.getTaskId(filename); task.id = this.getTaskId(filename);
task.status = this.statusCode.waiting; task.status = this.statusCode.waiting;
task.created = common.getTimestampInSeconds();
task.url = await readFile(filepath, { encoding: 'utf8' }); task.url = await readFile(filepath, { encoding: 'utf8' });
if (task.url) { if (task.url) {

5
log/Readme.md

@ -0,0 +1,5 @@
# log dir
Stores the log of each new task.

16
spider.mjs

@ -3,6 +3,7 @@
* 负责监听任务目录里的新任务并自动抓取数据保存到数据目录 * 负责监听任务目录里的新任务并自动抓取数据保存到数据目录
* 增加失败任务的重试机制 * 增加失败任务的重试机制
* 增加失败任务上报 * 增加失败任务上报
* 增加任务处理超时
*/ */
import getConfigs from './config.mjs'; import getConfigs from './config.mjs';
import common from './lib/common.mjs'; import common from './lib/common.mjs';
@ -16,6 +17,7 @@ import Xigua from './bot/Xigua.mjs';
import Bilibili from './bot/Bilibili.mjs'; import Bilibili from './bot/Bilibili.mjs';
import cron from 'node-cron'; import cron from 'node-cron';
import path from 'node:path';
(async () => { (async () => {
//设置configs为全局变量 //设置configs为全局变量
@ -45,14 +47,23 @@ import cron from 'node-cron';
const heroCloudServer = typeof(configs.cloud_server) != 'undefined' && configs.cloud_server ? configs.cloud_server : ''; const heroCloudServer = typeof(configs.cloud_server) != 'undefined' && configs.cloud_server ? configs.cloud_server : '';
//spider run //spider run
let spider_is_running = false; let spider_is_running = false,
last_run_time = 0;
const task_check_time = 20; //每 20 秒抓取一次 const task_check_time = 20; //每 20 秒抓取一次
const task_auto_run = cron.schedule(`*/${task_check_time} * * * * *`, async () => { const task_auto_run = cron.schedule(`*/${task_check_time} * * * * *`, async () => {
if (spider_is_running == true) {return false;} //避免同时执行多个爬虫任务 const current_time = common.getTimestampInSeconds();
//避免同时执行多个爬虫任务,并检查上个任务执行是否超时
if (spider_is_running == true && current_time - last_run_time < configs.task_do_timeout) {
return false;
}
const task = taskMoniter.getNewTask(); const task = taskMoniter.getNewTask();
if (!task) {return false;} if (!task) {return false;}
let logFile = path.resolve(configs.task_log_dir) + `/tasks_${heroUnionConfig.name}.log`;
await common.saveLog(logFile, JSON.stringify(task) + "\n");
const botName = common.getBotName(task.url); const botName = common.getBotName(task.url);
console.log('New task %s handle by bot %s.', task.url, botName); console.log('New task %s handle by bot %s.', task.url, botName);
let bot = null; let bot = null;
@ -73,6 +84,7 @@ import cron from 'node-cron';
if (bot) { if (bot) {
spider_is_running = true; spider_is_running = true;
last_run_time = common.getTimestampInSeconds();
taskMoniter.setTaskRunning(task.id); taskMoniter.setTaskRunning(task.id);
const data = await bot.scrap(task.url); const data = await bot.scrap(task.url);

Loading…
Cancel
Save