You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
181 lines
6.5 KiB
181 lines
6.5 KiB
/** |
|
* 爬虫主程序 |
|
* 负责监听任务目录里的新任务,并自动抓取数据保存到数据目录。 |
|
* 增加失败任务的重试机制 |
|
* 增加失败任务上报 |
|
* 增加任务处理超时 |
|
*/ |
|
import getConfigs from './config.mjs'; |
|
import common from './lib/common.mjs'; |
|
import TaskMoniter from "./lib/taskMoniter.mjs"; |
|
import TaJian from "./lib/tajian.mjs"; |
|
import HeroBot from "./lib/heroBot.mjs"; |
|
|
|
import Douyin from './bot/Douyin.mjs'; |
|
import Kuaishou from './bot/Kuaishou.mjs'; |
|
import Xigua from './bot/Xigua.mjs'; |
|
import Bilibili from './bot/Bilibili.mjs'; |
|
import WebCrawler from './bot/WebCrawler.mjs'; |
|
|
|
import cron from 'node-cron'; |
|
import path from 'node:path'; |
|
|
|
(async () => { |
|
//设置configs为全局变量 |
|
let configFile = ''; |
|
if (process.argv.length >= 3) { |
|
configFile = process.argv[2]; |
|
} |
|
global.configs = await getConfigs(configFile); |
|
|
|
const taskMoniter = new TaskMoniter(configs.task_list_dir); |
|
const tajian = new TaJian(configs.data_save_dir); |
|
|
|
taskMoniter.run(); //监控新任务 |
|
|
|
//HeroUnion英雄联盟对接 |
|
let heroUnionConfig = configs.herounion; |
|
let heroBot = new HeroBot( |
|
heroUnionConfig.server_url, |
|
heroUnionConfig.name, |
|
heroUnionConfig.description, |
|
heroUnionConfig.platforms, |
|
heroUnionConfig.contracts, |
|
heroUnionConfig.country, |
|
heroUnionConfig.lang, |
|
heroUnionConfig.contact, |
|
heroUnionConfig.data_mode |
|
); |
|
|
|
|
|
//配置本地cloud server地址,cloud安装参考:./install_cloud.sh |
|
const heroCloudServer = typeof(configs.cloud_server) != 'undefined' && configs.cloud_server ? configs.cloud_server : ''; |
|
|
|
//spider run |
|
let spider_is_running = false, |
|
last_run_time = 0; |
|
const task_check_time = 20; //每 20 秒抓取一次 |
|
const task_auto_run = cron.schedule(`*/${task_check_time} * * * * *`, async () => { |
|
const current_time = common.getTimestampInSeconds(); |
|
|
|
//避免同时执行多个爬虫任务,并检查上个任务执行是否超时 |
|
if (spider_is_running == true && current_time - last_run_time < configs.task_do_timeout) { |
|
return false; |
|
} |
|
|
|
//随机延迟一段时间,将不同爬虫的执行时间错开 |
|
let rnd_secods = parseInt(Math.random() * task_check_time); |
|
console.log("Sleep %s seconds before crap...", rnd_secods); |
|
await common.delay(rnd_secods); |
|
|
|
|
|
const task = taskMoniter.getNewTask(); |
|
if (!task) {return false;} |
|
|
|
//标记爬虫开始执行任务 |
|
spider_is_running = true; |
|
last_run_time = common.getTimestampInSeconds(); |
|
|
|
let logFile = path.resolve(configs.task_log_dir) + `/tasks_${heroUnionConfig.name}.log`; |
|
await common.saveLog(logFile, JSON.stringify(task) + "\n"); |
|
|
|
const botName = common.getBotName(task.url); |
|
console.log('New task %s handle by bot %s, url: %s, cloud server: %s', task.id, botName, task.url, heroCloudServer); |
|
let bot = null; |
|
switch (botName) { |
|
case 'douyin': |
|
bot = new Douyin(heroCloudServer); |
|
bot.setMode('mob'); //使用手机模式 |
|
break; |
|
case 'kuaishou': |
|
bot = new Kuaishou(heroCloudServer); |
|
break; |
|
case 'xigua': |
|
bot = new Xigua(heroCloudServer); |
|
break; |
|
case 'bilibili': |
|
bot = new Bilibili(heroCloudServer); |
|
break; |
|
|
|
default: |
|
bot = new WebCrawler(heroCloudServer, botName); |
|
break; |
|
} |
|
|
|
if (bot) { |
|
console.log('Spider craping...'); |
|
|
|
let taskStarted = taskMoniter.setTaskRunning(task.id); |
|
const data = await bot.scrap(task.url); |
|
//console.log('Data got by bot', data); |
|
|
|
if (typeof(data.done) != 'undefined' && data.done == true) { |
|
task.data = data; //把抓取到的数据保存到任务里 |
|
taskMoniter.updateTask(task.id, task); |
|
|
|
if ( |
|
await tajian.saveUrlShortcut(task.id, data) |
|
&& await tajian.saveDescriptionFiles(task.id, data) |
|
) { |
|
//马上回传一次数据 |
|
taskMoniter.notifyHandle(task); |
|
|
|
//标记任务完成 |
|
taskMoniter.setTaskDone(task.id); |
|
}else { |
|
taskMoniter.setTaskFailed(task.id); |
|
} |
|
}else { |
|
//失败后最多重试 5 次 |
|
if (typeof(task.fail_retry) == 'undefined') { |
|
task.fail_retry = 0; |
|
}else { |
|
task.fail_retry ++; |
|
} |
|
|
|
taskMoniter.updateTask(task.id, task); |
|
|
|
if (task.fail_retry > configs.max_fail_retry) { |
|
taskMoniter.setTaskFailed(task.id); |
|
|
|
//上报联盟,任务失败 |
|
heroBot.saveTaskData(task.id, task.token, [], 'failed'); |
|
}else { |
|
taskMoniter.setTaskWaiting(task.id); //重新进入等待处理状态 |
|
} |
|
} |
|
|
|
spider_is_running = false; |
|
}else { |
|
console.error('No bot matched with url %s', task.url); |
|
spider_is_running = false; |
|
taskMoniter.setTaskFailed(task.id); |
|
} |
|
}, { |
|
scheduled: false |
|
}); |
|
|
|
task_auto_run.start(); |
|
console.log('[%s] Spider started.', common.getTimeString()); |
|
|
|
|
|
//爬虫心跳上报 |
|
const heartBeatFrequence = 5; //5 分钟上报一次 |
|
const heroUnionHeartBeat = cron.schedule(`*/${heartBeatFrequence} * * * *`, async () => { |
|
//随机延迟一段时间,将不同爬虫的执行时间错开 |
|
let rnd_secods = parseInt(Math.random() * 60); |
|
console.log("Sleep %s seconds before send heart beat...", rnd_secods); |
|
await common.delay(rnd_secods); |
|
|
|
let status = spider_is_running ? 'busy' : 'idle'; |
|
const res = await heroBot.heartBeat(status); |
|
console.log('HeroUnion bot heart beat result', res); |
|
}, {scheduled: false}); |
|
heroUnionHeartBeat.start(); |
|
|
|
let heartBeatRes = await heroBot.heartBeat('idle'); //马上上报一次 |
|
console.log('[%s] HeroUnion bot heart beat started.', common.getTimeString(), heartBeatRes); |
|
})().catch(error => { |
|
console.error("Spider error got:\n%s", error); |
|
process.exit(1); |
|
});
|
|
|