Browse Source

spider ready, task moniter ready

master
filesite 1 year ago
parent
commit
58bff9943c
  1. 10
      bot/HeroBot.mjs
  2. 3
      config.mjs
  3. 16
      lib/common.mjs
  4. 1
      lib/tajian.mjs
  5. 36
      lib/taskMoniter.mjs
  6. 51
      spider.mjs
  7. 1
      test/scrap_test.mjs

10
bot/HeroBot.mjs

@ -2,6 +2,7 @@ import Hero from '@ulixee/hero';
import configs from '../config.mjs'; import configs from '../config.mjs';
import fs from 'node:fs'; import fs from 'node:fs';
import path from 'node:path'; import path from 'node:path';
import { fileURLToPath } from 'url';
import ClientLogPlugin from '../plugin/ClientLogPlugin.mjs'; import ClientLogPlugin from '../plugin/ClientLogPlugin.mjs';
class HeroBot { class HeroBot {
@ -16,6 +17,9 @@ class HeroBot {
}; };
this.name = ''; this.name = '';
const __filename = fileURLToPath(import.meta.url);
this.root = path.dirname(__filename);
} }
//返回profile对象 //返回profile对象
@ -38,13 +42,13 @@ class HeroBot {
options.connectionToCore = this.heroServer; options.connectionToCore = this.heroServer;
} }
const profilePath = path.resolve('../tmp/', `profile_${botName}.json`); const profilePath = path.resolve(this.root, '../tmp/', `profile_${botName}.json`);
if (fs.existsSync(profilePath) != false) { if (fs.existsSync(profilePath) != false) {
const json = fs.readFileSync(profilePath, { encoding: 'utf8' }); const json = fs.readFileSync(profilePath, { encoding: 'utf8' });
return JSON.parse(json); return JSON.parse(json);
} }
console.log('Hero init配置', configs); //console.log('Hero init配置', configs);
const hero = new Hero(options); const hero = new Hero(options);
hero.use(ClientLogPlugin); //开启log hero.use(ClientLogPlugin); //开启log
@ -76,7 +80,7 @@ class HeroBot {
try { try {
//保存profile //保存profile
const profilePath = path.resolve('../tmp/', `profile_${botName}.json`); const profilePath = path.resolve(this.root, '../tmp/', `profile_${botName}.json`);
profile = this.fixCookies(profile); profile = this.fixCookies(profile);
fs.writeFileSync(profilePath, JSON.stringify(profile, null, 2)); fs.writeFileSync(profilePath, JSON.stringify(profile, null, 2));
}catch(error) { }catch(error) {

3
config.mjs

@ -5,7 +5,8 @@ export default {
//bot相关配置 //bot相关配置
userAgent: '~ chrome >= 114 && mac', //userAgent: '~ chrome >= 114 && mac', //指定操作系统和浏览器版本
userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
viewport: { viewport: {
width: 1440, width: 1440,
height: 900 height: 900

16
lib/common.mjs

@ -19,6 +19,22 @@ export default {
} }
return done; return done;
},
getBotName: function(url) {
let botName = '';
if (/douyin\.com/ig.test(url)) {
botName = 'douyin';
}else if (/kuaishou\.com/ig.test(url)) {
botName = 'kuaishou';
}else if (/ixigua\.com/ig.test(url)) {
botName = 'xigua';
}else if (/b23\.tv/ig.test(url) || /bilibili\.com/ig.test(url)) {
botName = 'bilibili';
} }
return botName;
},
}; };

1
lib/tajian.mjs

@ -10,6 +10,7 @@ class TaJian {
} }
async saveUrlShortcut(data) { async saveUrlShortcut(data) {
console.log('TaJian try to save data', data);
} }

36
lib/taskMoniter.mjs

@ -4,6 +4,10 @@
* 删除已完成的任务文件 * 删除已完成的任务文件
* 内存中保存所有任务及其状态 * 内存中保存所有任务及其状态
* 返回当前任务状态 * 返回当前任务状态
* -------------------
* 注意任务清单文件名不能重复如果一个新任务文件名跟已经处理过的任务重名则不会被处理
* -------------------
* task数据结构{id:'', url: '', status:''}
*/ */
import common from './common.mjs'; import common from './common.mjs';
import fs from 'node:fs'; import fs from 'node:fs';
@ -40,6 +44,12 @@ class TaskMoniter {
return `${dirPath}/${task_id}.task`; return `${dirPath}/${task_id}.task`;
} }
//注意:任务文件名不能重复,已经用过的文件名不能再使用
//推荐以时间戳为任务文件名,如:1694762776985.task
getTaskId(filename) {
return filename.replace('.task', '');
}
getStatus() { getStatus() {
return this.taskStatus; return this.taskStatus;
} }
@ -47,9 +57,9 @@ class TaskMoniter {
getNewTask() { getNewTask() {
let task = null; let task = null;
for (const item of this.tasks) { for (const id in this.tasks) {
if (item.status == this.statusCode.waiting) { if (this.tasks[id].status == this.statusCode.waiting) {
task = item; task = this.tasks[id];
break; break;
} }
} }
@ -64,6 +74,7 @@ class TaskMoniter {
this.tasks[task_id].status = this.statusCode.running; this.tasks[task_id].status = this.statusCode.running;
this.taskStatus[this.statusCode.running] ++; this.taskStatus[this.statusCode.running] ++;
this.taskStatus[this.statusCode.waiting] --;
return true; return true;
} }
@ -75,6 +86,7 @@ class TaskMoniter {
this.tasks[task_id].status = this.statusCode.done; this.tasks[task_id].status = this.statusCode.done;
this.taskStatus[this.statusCode.done] ++; this.taskStatus[this.statusCode.done] ++;
this.taskStatus[this.statusCode.running] --;
const filepath = this.getTaskFilePath(task_id); const filepath = this.getTaskFilePath(task_id);
common.removeFile(filepath); //async delete common.removeFile(filepath); //async delete
@ -88,6 +100,7 @@ class TaskMoniter {
this.tasks[task_id].status = this.statusCode.failed; this.tasks[task_id].status = this.statusCode.failed;
this.taskStatus[this.statusCode.failed] ++; this.taskStatus[this.statusCode.failed] ++;
this.taskStatus[this.statusCode.running] --;
return true; return true;
} }
@ -96,7 +109,7 @@ class TaskMoniter {
let task = {}; let task = {};
try { try {
task.id = filename.replace('.task', ''); task.id = this.getTaskId(filename);
task.status = this.statusCode.waiting; task.status = this.statusCode.waiting;
task.url = await readFile(filepath, { encoding: 'utf8' }); task.url = await readFile(filepath, { encoding: 'utf8' });
@ -134,11 +147,16 @@ class TaskMoniter {
const dirPath = path.resolve(this.task_dir); const dirPath = path.resolve(this.task_dir);
const files = await readdir(dirPath); const files = await readdir(dirPath);
let task = null; let task = null, task_id = null;
for (const file of files) { for (const filename of files) {
if (file.indexOf('.task') === -1) {continue;} //ignore not *.task files if (filename.indexOf('.task') === -1) {continue;} //ignore not *.task files
task_id = this.getTaskId(filename);
if (typeof(this.tasks[task_id]) != 'undefined') { //跳过已经存在的任务
continue;
}
task = await this.parseTaskFile(file, `${dirPath}/${file}`); task = await this.parseTaskFile(filename, `${dirPath}/${filename}`);
this.addTask(task); this.addTask(task);
} }
@ -152,7 +170,7 @@ class TaskMoniter {
run() { //开始监控任务目录,把所有任务缓存到内存 run() { //开始监控任务目录,把所有任务缓存到内存
console.log('[%s] TaskMoniter started.', common.getTimeString()); console.log('[%s] TaskMoniter started.', common.getTimeString());
//auto run //auto check new tasks
const _self = this; const _self = this;
const task_check_time = this.check_time_gap; const task_check_time = this.check_time_gap;
const task_auto_run = cron.schedule(`*/${task_check_time} * * * * *`, async () => { const task_auto_run = cron.schedule(`*/${task_check_time} * * * * *`, async () => {

51
spider.mjs

@ -3,6 +3,7 @@
* 负责监听任务目录里的新任务并自动抓取数据保存到数据目录 * 负责监听任务目录里的新任务并自动抓取数据保存到数据目录
*/ */
import configs from './config.mjs'; import configs from './config.mjs';
import common from './lib/common.mjs';
import TaskMoniter from "./lib/taskMoniter.mjs"; import TaskMoniter from "./lib/taskMoniter.mjs";
import TaJian from "./lib/tajian.mjs"; import TaJian from "./lib/tajian.mjs";
@ -11,12 +12,60 @@ import Kuaishou from './bot/Kuaishou.mjs';
import Xigua from './bot/Xigua.mjs'; import Xigua from './bot/Xigua.mjs';
import Bilibili from './bot/Bilibili.mjs'; import Bilibili from './bot/Bilibili.mjs';
import cron from 'node-cron';
(async () => { (async () => {
const taskMoniter = new TaskMoniter(configs.task_list_dir); const taskMoniter = new TaskMoniter(configs.task_list_dir);
const tajian = new TaJian(configs.data_save_dir); const tajian = new TaJian(configs.data_save_dir);
taskMoniter.run(); taskMoniter.run(); //监控新任务
const heroCloudServer = 'ws://192.168.3.13:1818';
//spider run
const task_check_time = 20; //每 20 秒抓取一次
// Scheduled job: every `task_check_time` seconds take one waiting task,
// pick the bot that matches its URL, scrape it and hand the result to tajian.
// Created with `scheduled: false`, so it does not run until .start() is called.
const task_auto_run = cron.schedule(`*/${task_check_time} * * * * *`, async () => {
  const task = taskMoniter.getNewTask();
  if (!task) {return false;}

  const botName = common.getBotName(task.url);
  console.log('New task %s handle by bot %s.', task.url, botName);

  let bot = null;
  switch (botName) {
    case 'douyin':
      bot = new Douyin(heroCloudServer);
      break;
    case 'kuaishou':
      bot = new Kuaishou(heroCloudServer);
      break;
    case 'xigua':
      bot = new Xigua(heroCloudServer);
      break;
    case 'bilibili':
      bot = new Bilibili(heroCloudServer);
      break;
  }

  if (!bot) {
    // NOTE(review): the task is left in the waiting state here, so it will
    // presumably be picked up again on the next tick. TaskMoniter appears to
    // have a method that marks a task as failed; consider calling it here.
    console.error('No bot matched with url %s', task.url);
    return false;
  }

  taskMoniter.setTaskRunning(task.id);

  // Errors thrown inside this callback are not seen by the caller's
  // .catch(), so they are handled here instead of becoming unhandled rejections.
  try {
    const data = await bot.scrap(task.url);
    console.log('Data got by bot', data);

    // Guard against a bot returning nothing. The loose `== true` is kept
    // from the original check on data.done.
    if (data != null && data.done == true) {
      // Save before marking the task done: marking it done removes the task
      // file, so a failed save after that would lose the task.
      await tajian.saveUrlShortcut(data);
      taskMoniter.setTaskDone(task.id);
    }
    // NOTE(review): when data.done is not set the task stays in the running
    // state; confirm whether it should be marked as failed instead.
  } catch (error) {
    // NOTE(review): the task also stays in the running state after an error.
    console.error('Task %s (%s) failed:\n%s', task.id, task.url, error);
  }
}, {
  scheduled: false
});
task_auto_run.start();
console.log('[%s] Spider started.', common.getTimeString());
})().catch(error => { })().catch(error => {
console.error("Spider error got:\n%s", error); console.error("Spider error got:\n%s", error);

1
test/scrap_test.mjs

@ -19,6 +19,7 @@ import configs from '../config.mjs';
case 'douyin': case 'douyin':
//抖音测试 //抖音测试
url = 'https://v.douyin.com/ieUpFCva/'; //mob and pc url = 'https://v.douyin.com/ieUpFCva/'; //mob and pc
url = 'https://v.douyin.com/i8sEyb6/'; //mob and pc
configs.heroTabOptions.timeoutMs = 20000; //所有内容加载完成超时 configs.heroTabOptions.timeoutMs = 20000; //所有内容加载完成超时

Loading…
Cancel
Save