Browse Source

spider ready, task monitor ready

master
filesite 1 year ago
parent
commit
58bff9943c
  1. 10
      bot/HeroBot.mjs
  2. 3
      config.mjs
  3. 18
      lib/common.mjs
  4. 1
      lib/tajian.mjs
  5. 36
      lib/taskMoniter.mjs
  6. 51
      spider.mjs
  7. 1
      test/scrap_test.mjs

10
bot/HeroBot.mjs

@ -2,6 +2,7 @@ import Hero from '@ulixee/hero'; @@ -2,6 +2,7 @@ import Hero from '@ulixee/hero';
import configs from '../config.mjs';
import fs from 'node:fs';
import path from 'node:path';
import { fileURLToPath } from 'url';
import ClientLogPlugin from '../plugin/ClientLogPlugin.mjs';
class HeroBot {
@ -16,6 +17,9 @@ class HeroBot { @@ -16,6 +17,9 @@ class HeroBot {
};
this.name = '';
const __filename = fileURLToPath(import.meta.url);
this.root = path.dirname(__filename);
}
//返回profile对象
@ -38,13 +42,13 @@ class HeroBot { @@ -38,13 +42,13 @@ class HeroBot {
options.connectionToCore = this.heroServer;
}
const profilePath = path.resolve('../tmp/', `profile_${botName}.json`);
const profilePath = path.resolve(this.root, '../tmp/', `profile_${botName}.json`);
if (fs.existsSync(profilePath) != false) {
const json = fs.readFileSync(profilePath, { encoding: 'utf8' });
return JSON.parse(json);
}
console.log('Hero init配置', configs);
//console.log('Hero init配置', configs);
const hero = new Hero(options);
hero.use(ClientLogPlugin); //开启log
@ -76,7 +80,7 @@ class HeroBot { @@ -76,7 +80,7 @@ class HeroBot {
try {
//保存profile
const profilePath = path.resolve('../tmp/', `profile_${botName}.json`);
const profilePath = path.resolve(this.root, '../tmp/', `profile_${botName}.json`);
profile = this.fixCookies(profile);
fs.writeFileSync(profilePath, JSON.stringify(profile, null, 2));
}catch(error) {

3
config.mjs

@ -5,7 +5,8 @@ export default { @@ -5,7 +5,8 @@ export default {
//bot相关配置
userAgent: '~ chrome >= 114 && mac',
//userAgent: '~ chrome >= 114 && mac', //指定操作系统和浏览器版本
userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
viewport: {
width: 1440,
height: 900

18
lib/common.mjs

@ -19,6 +19,22 @@ export default { @@ -19,6 +19,22 @@ export default {
}
return done;
}
},
getBotName: function(url) {
let botName = '';
if (/douyin\.com/ig.test(url)) {
botName = 'douyin';
}else if (/kuaishou\.com/ig.test(url)) {
botName = 'kuaishou';
}else if (/ixigua\.com/ig.test(url)) {
botName = 'xigua';
}else if (/b23\.tv/ig.test(url) || /bilibili\.com/ig.test(url)) {
botName = 'bilibili';
}
return botName;
},
};

1
lib/tajian.mjs

@ -10,6 +10,7 @@ class TaJian { @@ -10,6 +10,7 @@ class TaJian {
}
/**
 * Log the data handed over for saving.
 * NOTE(review): the lines visible here only print the payload; nothing is
 * persisted — confirm whether the actual save is still to be implemented.
 *
 * @param {*} data - Payload to save (the spider passes the result of a bot scrape).
 */
async saveUrlShortcut(data) {
console.log('TaJian try to save data', data);
}

36
lib/taskMoniter.mjs

@ -4,6 +4,10 @@ @@ -4,6 +4,10 @@
* 删除已完成的任务文件
* 内存中保存所有任务及其状态
* 返回当前任务状态
* -------------------
* 注意：任务清单文件名不能重复。如果一个新任务文件名跟已经处理过的任务重名，则不会被处理。
* -------------------
* task数据结构{id:'', url: '', status:''}
*/
import common from './common.mjs';
import fs from 'node:fs';
@ -40,6 +44,12 @@ class TaskMoniter { @@ -40,6 +44,12 @@ class TaskMoniter {
return `${dirPath}/${task_id}.task`;
}
//注意:任务文件名不能重复,已经用过的文件名不能再使用
//推荐以时间戳为任务文件名,如:1694762776985.task
getTaskId(filename) {
return filename.replace('.task', '');
}
getStatus() {
return this.taskStatus;
}
@ -47,9 +57,9 @@ class TaskMoniter { @@ -47,9 +57,9 @@ class TaskMoniter {
getNewTask() {
let task = null;
for (const item of this.tasks) {
if (item.status == this.statusCode.waiting) {
task = item;
for (const id in this.tasks) {
if (this.tasks[id].status == this.statusCode.waiting) {
task = this.tasks[id];
break;
}
}
@ -64,6 +74,7 @@ class TaskMoniter { @@ -64,6 +74,7 @@ class TaskMoniter {
this.tasks[task_id].status = this.statusCode.running;
this.taskStatus[this.statusCode.running] ++;
this.taskStatus[this.statusCode.waiting] --;
return true;
}
@ -75,6 +86,7 @@ class TaskMoniter { @@ -75,6 +86,7 @@ class TaskMoniter {
this.tasks[task_id].status = this.statusCode.done;
this.taskStatus[this.statusCode.done] ++;
this.taskStatus[this.statusCode.running] --;
const filepath = this.getTaskFilePath(task_id);
common.removeFile(filepath); //async delete
@ -88,6 +100,7 @@ class TaskMoniter { @@ -88,6 +100,7 @@ class TaskMoniter {
this.tasks[task_id].status = this.statusCode.failed;
this.taskStatus[this.statusCode.failed] ++;
this.taskStatus[this.statusCode.running] --;
return true;
}
@ -96,7 +109,7 @@ class TaskMoniter { @@ -96,7 +109,7 @@ class TaskMoniter {
let task = {};
try {
task.id = filename.replace('.task', '');
task.id = this.getTaskId(filename);
task.status = this.statusCode.waiting;
task.url = await readFile(filepath, { encoding: 'utf8' });
@ -134,11 +147,16 @@ class TaskMoniter { @@ -134,11 +147,16 @@ class TaskMoniter {
const dirPath = path.resolve(this.task_dir);
const files = await readdir(dirPath);
let task = null;
for (const file of files) {
if (file.indexOf('.task') === -1) {continue;} //ignore not *.task files
let task = null, task_id = null;
for (const filename of files) {
if (filename.indexOf('.task') === -1) {continue;} //ignore not *.task files
task_id = this.getTaskId(filename);
if (typeof(this.tasks[task_id]) != 'undefined') { //跳过已经存在的任务
continue;
}
task = await this.parseTaskFile(file, `${dirPath}/${file}`);
task = await this.parseTaskFile(filename, `${dirPath}/${filename}`);
this.addTask(task);
}
@ -152,7 +170,7 @@ class TaskMoniter { @@ -152,7 +170,7 @@ class TaskMoniter {
run() { //开始监控任务目录,把所有任务缓存到内存
console.log('[%s] TaskMoniter started.', common.getTimeString());
//auto run
//auto check new tasks
const _self = this;
const task_check_time = this.check_time_gap;
const task_auto_run = cron.schedule(`*/${task_check_time} * * * * *`, async () => {

51
spider.mjs

@ -3,6 +3,7 @@ @@ -3,6 +3,7 @@
* 负责监听任务目录里的新任务并自动抓取数据保存到数据目录
*/
import configs from './config.mjs';
import common from './lib/common.mjs';
import TaskMoniter from "./lib/taskMoniter.mjs";
import TaJian from "./lib/tajian.mjs";
@ -11,12 +12,60 @@ import Kuaishou from './bot/Kuaishou.mjs'; @@ -11,12 +12,60 @@ import Kuaishou from './bot/Kuaishou.mjs';
import Xigua from './bot/Xigua.mjs';
import Bilibili from './bot/Bilibili.mjs';
import cron from 'node-cron';
(async () => {
const taskMoniter = new TaskMoniter(configs.task_list_dir);
const tajian = new TaJian(configs.data_save_dir);
taskMoniter.run();
taskMoniter.run(); //监控新任务
const heroCloudServer = 'ws://192.168.3.13:1818';
//spider run
const task_check_time = 20; //每 20 秒抓取一次
// Scheduled scrape loop: on every tick take one waiting task, choose a bot
// from the task URL, scrape it and hand the result to TaJian.
// The schedule is created stopped (`scheduled: false`) and started explicitly.
const task_auto_run = cron.schedule(`*/${task_check_time} * * * * *`, async () => {
    const task = taskMoniter.getNewTask();
    if (!task) {return false;}

    const botName = common.getBotName(task.url);
    console.log('New task %s handle by bot %s.', task.url, botName);

    let bot = null;
    switch (botName) {
        case 'douyin':
            bot = new Douyin(heroCloudServer);
            break;
        case 'kuaishou':
            bot = new Kuaishou(heroCloudServer);
            break;
        case 'xigua':
            bot = new Xigua(heroCloudServer);
            break;
        case 'bilibili':
            bot = new Bilibili(heroCloudServer);
            break;
    }

    if (!bot) {
        // NOTE(review): the task keeps its waiting status here, so it will be
        // picked up again on the next tick — consider marking it as failed.
        console.error('No bot matched with url %s', task.url);
        return;
    }

    taskMoniter.setTaskRunning(task.id);
    try {
        const data = await bot.scrap(task.url);
        console.log('Data got by bot', data);
        // Guard against a null/undefined result before reading `done`.
        if (data && data.done == true) {
            taskMoniter.setTaskDone(task.id);
            await tajian.saveUrlShortcut(data);
        }
        // NOTE(review): when the scrape does not report `done`, the task is
        // left in the running state — consider marking it as failed.
    } catch (error) {
        // Keep a failing scrape from rejecting inside the scheduler callback,
        // where nothing would handle the rejection.
        console.error('Task %s failed with url %s:\n%s', task.id, task.url, error);
    }
}, {
    scheduled: false
});
task_auto_run.start();
console.log('[%s] Spider started.', common.getTimeString());
})().catch(error => {
console.error("Spider error got:\n%s", error);

1
test/scrap_test.mjs

@ -19,6 +19,7 @@ import configs from '../config.mjs'; @@ -19,6 +19,7 @@ import configs from '../config.mjs';
case 'douyin':
//抖音测试
url = 'https://v.douyin.com/ieUpFCva/'; //mob and pc
url = 'https://v.douyin.com/i8sEyb6/'; //mob and pc
configs.heroTabOptions.timeoutMs = 20000; //所有内容加载完成超时

Loading…
Cancel
Save