|
|
@ -3,6 +3,7 @@ |
|
|
|
* 负责监听任务目录里的新任务,并自动抓取数据保存到数据目录。 |
|
|
|
* 负责监听任务目录里的新任务,并自动抓取数据保存到数据目录。 |
|
|
|
* 增加失败任务的重试机制 |
|
|
|
* 增加失败任务的重试机制 |
|
|
|
* 增加失败任务上报 |
|
|
|
* 增加失败任务上报 |
|
|
|
|
|
|
|
* 增加任务处理超时 |
|
|
|
*/ |
|
|
|
*/ |
|
|
|
import getConfigs from './config.mjs'; |
|
|
|
import getConfigs from './config.mjs'; |
|
|
|
import common from './lib/common.mjs'; |
|
|
|
import common from './lib/common.mjs'; |
|
|
@ -16,6 +17,7 @@ import Xigua from './bot/Xigua.mjs'; |
|
|
|
import Bilibili from './bot/Bilibili.mjs'; |
|
|
|
import Bilibili from './bot/Bilibili.mjs'; |
|
|
|
|
|
|
|
|
|
|
|
import cron from 'node-cron'; |
|
|
|
import cron from 'node-cron'; |
|
|
|
|
|
|
|
import path from 'node:path'; |
|
|
|
|
|
|
|
|
|
|
|
(async () => { |
|
|
|
(async () => { |
|
|
|
//设置configs为全局变量
|
|
|
|
//设置configs为全局变量
|
|
|
@ -45,14 +47,23 @@ import cron from 'node-cron'; |
|
|
|
const heroCloudServer = typeof(configs.cloud_server) != 'undefined' && configs.cloud_server ? configs.cloud_server : ''; |
|
|
|
const heroCloudServer = typeof(configs.cloud_server) != 'undefined' && configs.cloud_server ? configs.cloud_server : ''; |
|
|
|
|
|
|
|
|
|
|
|
//spider run
|
|
|
|
//spider run
|
|
|
|
let spider_is_running = false; |
|
|
|
let spider_is_running = false, |
|
|
|
|
|
|
|
last_run_time = 0; |
|
|
|
const task_check_time = 20; //每 20 秒抓取一次
|
|
|
|
const task_check_time = 20; //每 20 秒抓取一次
|
|
|
|
const task_auto_run = cron.schedule(`*/${task_check_time} * * * * *`, async () => { |
|
|
|
const task_auto_run = cron.schedule(`*/${task_check_time} * * * * *`, async () => { |
|
|
|
if (spider_is_running == true) {return false;} //避免同时执行多个爬虫任务
|
|
|
|
const current_time = common.getTimestampInSeconds(); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
//避免同时执行多个爬虫任务,并检查上个任务执行是否超时
|
|
|
|
|
|
|
|
if (spider_is_running == true && current_time - last_run_time < configs.task_do_timeout) { |
|
|
|
|
|
|
|
return false; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
const task = taskMoniter.getNewTask(); |
|
|
|
const task = taskMoniter.getNewTask(); |
|
|
|
if (!task) {return false;} |
|
|
|
if (!task) {return false;} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
let logFile = path.resolve(configs.task_log_dir) + `/tasks_${heroUnionConfig.name}.log`; |
|
|
|
|
|
|
|
await common.saveLog(logFile, JSON.stringify(task) + "\n"); |
|
|
|
|
|
|
|
|
|
|
|
const botName = common.getBotName(task.url); |
|
|
|
const botName = common.getBotName(task.url); |
|
|
|
console.log('New task %s handle by bot %s.', task.url, botName); |
|
|
|
console.log('New task %s handle by bot %s.', task.url, botName); |
|
|
|
let bot = null; |
|
|
|
let bot = null; |
|
|
@ -73,6 +84,7 @@ import cron from 'node-cron'; |
|
|
|
|
|
|
|
|
|
|
|
if (bot) { |
|
|
|
if (bot) { |
|
|
|
spider_is_running = true; |
|
|
|
spider_is_running = true; |
|
|
|
|
|
|
|
last_run_time = common.getTimestampInSeconds(); |
|
|
|
|
|
|
|
|
|
|
|
taskMoniter.setTaskRunning(task.id); |
|
|
|
taskMoniter.setTaskRunning(task.id); |
|
|
|
const data = await bot.scrap(task.url); |
|
|
|
const data = await bot.scrap(task.url); |
|
|
|