|
|
|
/**
|
|
|
|
* 爬虫主程序
|
|
|
|
* 负责监听任务目录里的新任务,并自动抓取数据保存到数据目录。
|
|
|
|
* 增加失败任务的重试机制
|
|
|
|
* 增加失败任务上报
|
|
|
|
* 增加任务处理超时
|
|
|
|
*/
|
|
|
|
import getConfigs from './config.mjs';
|
|
|
|
import common from './lib/common.mjs';
|
|
|
|
import TaskMoniter from "./lib/taskMoniter.mjs";
|
|
|
|
import TaJian from "./lib/tajian.mjs";
|
|
|
|
import HeroBot from "./lib/heroBot.mjs";
|
|
|
|
|
|
|
|
import Douyin from './bot/Douyin.mjs';
|
|
|
|
import Kuaishou from './bot/Kuaishou.mjs';
|
|
|
|
import Xigua from './bot/Xigua.mjs';
|
|
|
|
import Bilibili from './bot/Bilibili.mjs';
|
|
|
|
import WebCrawler from './bot/WebCrawler.mjs';
|
|
|
|
|
|
|
|
import cron from 'node-cron';
|
|
|
|
import path from 'node:path';
|
|
|
|
|
|
|
|
(async () => {
|
|
|
|
//设置configs为全局变量
|
|
|
|
let configFile = '';
|
|
|
|
if (process.argv.length >= 3) {
|
|
|
|
configFile = process.argv[2];
|
|
|
|
}
|
|
|
|
global.configs = await getConfigs(configFile);
|
|
|
|
|
|
|
|
const taskMoniter = new TaskMoniter(configs.task_list_dir);
|
|
|
|
const tajian = new TaJian(configs.data_save_dir);
|
|
|
|
|
|
|
|
taskMoniter.run(); //监控新任务
|
|
|
|
|
|
|
|
//HeroUnion英雄联盟对接
|
|
|
|
let heroUnionConfig = configs.herounion;
|
|
|
|
let heroBot = new HeroBot(
|
|
|
|
heroUnionConfig.server_url,
|
|
|
|
heroUnionConfig.name,
|
|
|
|
heroUnionConfig.description,
|
|
|
|
heroUnionConfig.platforms,
|
|
|
|
heroUnionConfig.contracts,
|
|
|
|
heroUnionConfig.country,
|
|
|
|
heroUnionConfig.lang,
|
|
|
|
heroUnionConfig.contact,
|
|
|
|
heroUnionConfig.data_mode
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
|
|
//配置本地cloud server地址,cloud安装参考:./install_cloud.sh
|
|
|
|
const heroCloudServer = typeof(configs.cloud_server) != 'undefined' && configs.cloud_server ? configs.cloud_server : '';
|
|
|
|
|
|
|
|
//spider run
|
|
|
|
let spider_is_running = false,
|
|
|
|
last_run_time = 0;
|
|
|
|
const task_check_time = 20; //每 20 秒抓取一次
|
|
|
|
const task_auto_run = cron.schedule(`*/${task_check_time} * * * * *`, async () => {
|
|
|
|
const current_time = common.getTimestampInSeconds();
|
|
|
|
|
|
|
|
//避免同时执行多个爬虫任务,并检查上个任务执行是否超时
|
|
|
|
if (spider_is_running == true && current_time - last_run_time < configs.task_do_timeout) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
//随机延迟一段时间,将不同爬虫的执行时间错开
|
|
|
|
let rnd_secods = parseInt(Math.random() * task_check_time);
|
|
|
|
console.log("Sleep %s seconds before crap...", rnd_secods);
|
|
|
|
await common.delay(rnd_secods);
|
|
|
|
|
|
|
|
|
|
|
|
const task = taskMoniter.getNewTask();
|
|
|
|
if (!task) {return false;}
|
|
|
|
|
|
|
|
//标记爬虫开始执行任务
|
|
|
|
spider_is_running = true;
|
|
|
|
last_run_time = common.getTimestampInSeconds();
|
|
|
|
|
|
|
|
let logFile = path.resolve(configs.task_log_dir) + `/tasks_${heroUnionConfig.name}.log`;
|
|
|
|
await common.saveLog(logFile, JSON.stringify(task) + "\n");
|
|
|
|
|
|
|
|
const botName = common.getBotName(task.url);
|
|
|
|
console.log('New task %s handle by bot %s, url: %s, cloud server: %s', task.id, botName, task.url, heroCloudServer);
|
|
|
|
let bot = null;
|
|
|
|
switch (botName) {
|
|
|
|
case 'douyin':
|
|
|
|
bot = new Douyin(heroCloudServer);
|
|
|
|
bot.setMode('mob'); //使用手机模式
|
|
|
|
break;
|
|
|
|
case 'kuaishou':
|
|
|
|
bot = new Kuaishou(heroCloudServer);
|
|
|
|
break;
|
|
|
|
case 'xigua':
|
|
|
|
bot = new Xigua(heroCloudServer);
|
|
|
|
break;
|
|
|
|
case 'bilibili':
|
|
|
|
bot = new Bilibili(heroCloudServer);
|
|
|
|
break;
|
|
|
|
|
|
|
|
default:
|
|
|
|
bot = new WebCrawler(heroCloudServer, botName);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (bot) {
|
|
|
|
console.log('Spider craping...');
|
|
|
|
|
|
|
|
let taskStarted = taskMoniter.setTaskRunning(task.id);
|
|
|
|
const data = await bot.scrap(task.url);
|
|
|
|
//console.log('Data got by bot', data);
|
|
|
|
|
|
|
|
if (typeof(data.done) != 'undefined' && data.done == true) {
|
|
|
|
task.data = data; //把抓取到的数据保存到任务里
|
|
|
|
taskMoniter.updateTask(task.id, task);
|
|
|
|
|
|
|
|
if (
|
|
|
|
await tajian.saveUrlShortcut(task.id, data)
|
|
|
|
&& await tajian.saveDescriptionFiles(task.id, data)
|
|
|
|
) {
|
|
|
|
//马上回传一次数据
|
|
|
|
taskMoniter.notifyHandle(task);
|
|
|
|
|
|
|
|
//标记任务完成
|
|
|
|
taskMoniter.setTaskDone(task.id);
|
|
|
|
}else {
|
|
|
|
taskMoniter.setTaskFailed(task.id);
|
|
|
|
}
|
|
|
|
}else {
|
|
|
|
//失败后最多重试 5 次
|
|
|
|
if (typeof(task.fail_retry) == 'undefined') {
|
|
|
|
task.fail_retry = 0;
|
|
|
|
}else {
|
|
|
|
task.fail_retry ++;
|
|
|
|
}
|
|
|
|
|
|
|
|
taskMoniter.updateTask(task.id, task);
|
|
|
|
|
|
|
|
if (task.fail_retry > configs.max_fail_retry) {
|
|
|
|
taskMoniter.setTaskFailed(task.id);
|
|
|
|
|
|
|
|
//上报联盟,任务失败
|
|
|
|
heroBot.saveTaskData(task.id, task.token, [], 'failed');
|
|
|
|
}else {
|
|
|
|
taskMoniter.setTaskWaiting(task.id); //重新进入等待处理状态
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
spider_is_running = false;
|
|
|
|
}else {
|
|
|
|
console.error('No bot matched with url %s', task.url);
|
|
|
|
spider_is_running = false;
|
|
|
|
taskMoniter.setTaskFailed(task.id);
|
|
|
|
}
|
|
|
|
}, {
|
|
|
|
scheduled: false
|
|
|
|
});
|
|
|
|
|
|
|
|
task_auto_run.start();
|
|
|
|
console.log('[%s] Spider started.', common.getTimeString());
|
|
|
|
|
|
|
|
|
|
|
|
//爬虫心跳上报
|
|
|
|
const heartBeatFrequence = 5; //5 分钟上报一次
|
|
|
|
const heroUnionHeartBeat = cron.schedule(`*/${heartBeatFrequence} * * * *`, async () => {
|
|
|
|
//随机延迟一段时间,将不同爬虫的执行时间错开
|
|
|
|
let rnd_secods = parseInt(Math.random() * 60);
|
|
|
|
console.log("Sleep %s seconds before send heart beat...", rnd_secods);
|
|
|
|
await common.delay(rnd_secods);
|
|
|
|
|
|
|
|
let status = spider_is_running ? 'busy' : 'idle';
|
|
|
|
const res = await heroBot.heartBeat(status);
|
|
|
|
console.log('HeroUnion bot heart beat result', res);
|
|
|
|
}, {scheduled: false});
|
|
|
|
heroUnionHeartBeat.start();
|
|
|
|
|
|
|
|
let heartBeatRes = await heroBot.heartBeat('idle'); //马上上报一次
|
|
|
|
console.log('[%s] HeroUnion bot heart beat started.', common.getTimeString(), heartBeatRes);
|
|
|
|
})().catch(error => {
|
|
|
|
console.error("Spider error got:\n%s", error);
|
|
|
|
process.exit(1);
|
|
|
|
});
|