Browse Source

code reindent, add config for herounion

master
filesite 8 months ago
parent
commit
87754b28a0
  1. 20
      README.md
  2. 92
      config.mjs
  3. 265
      lib/taskMoniter.mjs
  4. 20
      package.json
  5. 58
      spider.mjs
  6. 18
      test/start_cloud.mjs
  7. 16
      test/tajia_test.mjs

20
README.md

@ -37,13 +37,14 @@ Hero scripts of machete.
## 使用方法 ## 使用方法
1. 下载本源码到本地后,进入项目根目录 1. 下载本源码到本地后,进入项目根目录
``` ```
git clone "https://git.filesite.io/filesite/machete_hero.git" git clone "https://git.filesite.io/filesite/machete_hero.git"
cd machete_hero/ cd machete_hero/
``` ```
2. 执行下面命令安装依赖包:
2. 执行下面命令安装依赖包
``` ```
npm install npm install
``` ```
@ -51,8 +52,19 @@ npm install
如果你对npm和node不熟悉,请自行了解。 如果你对npm和node不熟悉,请自行了解。
3. 写一个.mjs脚本,调用bot/下的类库,实现目标网页访问和解析获取所需数据。 3. 执行下面命令启动爬虫
```
npm start
```
在目录todo/里创建任务文件,爬虫检测到新任务后自动抓取数据并保存到data/目录下。
4. 二次开发
写一个.mjs脚本,调用bot/下的类库,实现目标网页访问和解析获取所需数据。
还可以参考bot/下的类库,实现对任意网站的数据抓取。
bot/目录下的类库调用方法,可参考test/scrap_test.mjs测试脚本, bot/目录下的类库调用方法,可参考test/scrap_test.mjs测试脚本,
测试脚本使用方法见test/README.md文档。 测试脚本使用方法见test/README.md文档。

92
config.mjs

@ -1,45 +1,57 @@
export default { export default {
//自动任务相关配置 //自动任务相关配置
task_list_dir: 'todo/', //待抓取任务文件保存目录 task_list_dir: 'todo/', //待抓取任务文件保存目录
data_save_dir: 'data/', //抓取完成数据保存目录,文件格式:.url快捷方式,详细说明见:https://filesite.io data_save_dir: 'data/', //抓取完成数据保存目录,文件格式:.url快捷方式,详细说明见:https://filesite.io
//bot相关配置 //herounion对接配置
//userAgent: '~ chrome >= 114 && mac', //指定操作系统和浏览器版本 herounion: {
userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36', name: 'machete_hero', //爬虫名字
viewport: { description: '支持Machete的TaJian皮肤的hero爬虫', //爬虫简介
width: 1440, platforms: 'douyin,kuaishou,xigua,bilibili', //爬虫支持的平台
height: 900 contracts: 'tajiantv', //爬虫支持的数据采集合约(可二次开发自定义)
}, country: 'cn', //爬虫所在国家
lang: 'zh', //爬虫支持的语言
viewports: { contact: 'https://filesite.io', //爬虫的联系方式
mob: {
width: 375,
height: 667
}, },
pc: {
//bot相关配置
cloud_server: 'ws://192.168.3.13:1818',
userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
viewport: {
width: 1440, width: 1440,
height: 900 height: 900
}, },
},
viewports: {
//请求参数 mob: {
heroBotOptions: { width: 375,
timeoutMs: 20000, height: 667
referrer: '', },
}, pc: {
width: 1440,
//网页tab参数 height: 900
heroTabOptions: { },
timeoutMs: 30000 },
},
//请求参数
//常用浏览器user-agent heroBotOptions: {
userAgents: { timeoutMs: 20000,
iphone_chrome: 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1', referrer: '',
iphone_wechat: 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/604.4.7 (KHTML, like Gecko) Mobile/15C202 MicroMessenger/6.6.1 NetType/4G Language/zh_CN', },
mac_chrome: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
android_wechat: 'Mozilla/5.0 (Linux; Android 7.1.1; OD103 Build/NMF26F; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.49 Mobile MQQBrowser/6.2 TBS/043632 Safari/537.36 MicroMessenger/6.6.1.1220(0x26060135) NetType/4G Language/zh_CN', //网页tab参数
} heroTabOptions: {
timeoutMs: 30000
}; },
//常用浏览器user-agent
userAgents: {
iphone_chrome: 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1',
iphone_wechat: 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/604.4.7 (KHTML, like Gecko) Mobile/15C202 MicroMessenger/6.6.1 NetType/4G Language/zh_CN',
mac_chrome: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
android_wechat: 'Mozilla/5.0 (Linux; Android 7.1.1; OD103 Build/NMF26F; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.49 Mobile MQQBrowser/6.2 TBS/043632 Safari/537.36 MicroMessenger/6.6.1.1220(0x26060135) NetType/4G Language/zh_CN',
}
};

265
lib/taskMoniter.mjs

@ -16,175 +16,172 @@ import path from 'node:path';
import cron from 'node-cron'; import cron from 'node-cron';
class TaskMoniter { class TaskMoniter {
constructor(task_list_dir) {
this.check_time_gap = 30; //检测间隔时间,单位:秒
this.checking = false;
constructor(task_list_dir) { this.task_dir = task_list_dir; //监控目录:任务列表保存目录
this.check_time_gap = 30; //检测间隔时间,单位:秒 this.tasks = {}; //内存中的任务列表
this.checking = false; this.taskStatus = { //当前任务状态
total: 0, //总任务数
this.task_dir = task_list_dir; //监控目录:任务列表保存目录 waiting: 0, //等待执行的任务数
this.tasks = {}; //内存中的任务列表 running: 0, //正在执行的任务数
this.taskStatus = { //当前任务状态 done: 0, //已完成的任务数
total: 0, //总任务数 failed: 0 //执行失败的任务数
waiting: 0, //等待执行的任务数 };
running: 0, //正在执行的任务数
done: 0, //已完成的任务数 this.statusCode = {
failed: 0 //执行失败的任务数 waiting: 'waiting',
}; running: 'running',
done: 'done',
this.statusCode = { failed: 'failed',
waiting: 'waiting', };
running: 'running',
done: 'done',
failed: 'failed',
};
}
getTaskFilePath(task_id) {
const dirPath = path.resolve(this.task_dir);
return `${dirPath}/${task_id}.task`;
}
//注意:任务文件名不能重复,已经用过的文件名不能再使用
//推荐以时间戳为任务文件名,如:1694762776985.task
getTaskId(filename) {
return filename.replace('.task', '');
}
getStatus() {
return this.taskStatus;
}
getNewTask() {
let task = null;
for (const id in this.tasks) {
if (this.tasks[id].status == this.statusCode.waiting) {
task = this.tasks[id];
break;
}
} }
return task; getTaskFilePath(task_id) {
} const dirPath = path.resolve(this.task_dir);
return `${dirPath}/${task_id}.task`;
setTaskRunning(task_id) {
if (typeof(this.tasks[task_id]) == 'undefined') {
return false;
} }
this.tasks[task_id].status = this.statusCode.running; //注意:任务文件名不能重复,已经用过的文件名不能再使用
this.taskStatus[this.statusCode.running] ++; //推荐以时间戳为任务文件名,如:1694762776985.task
this.taskStatus[this.statusCode.waiting] --; getTaskId(filename) {
return filename.replace('.task', '');
return true; }
}
setTaskDone(task_id) { getStatus() {
if (typeof(this.tasks[task_id]) == 'undefined') { return this.taskStatus;
return false;
} }
this.tasks[task_id].status = this.statusCode.done; getNewTask() {
this.taskStatus[this.statusCode.done] ++; let task = null;
this.taskStatus[this.statusCode.running] --;
const filepath = this.getTaskFilePath(task_id); for (const id in this.tasks) {
common.removeFile(filepath); //async delete if (this.tasks[id].status == this.statusCode.waiting) {
return true; task = this.tasks[id];
} break;
}
}
setTaskFailed(task_id) { return task;
if (typeof(this.tasks[task_id]) == 'undefined') {
return false;
} }
this.tasks[task_id].status = this.statusCode.failed; setTaskRunning(task_id) {
this.taskStatus[this.statusCode.failed] ++; if (typeof(this.tasks[task_id]) == 'undefined') {
this.taskStatus[this.statusCode.running] --; return false;
}
return true;
}
async parseTaskFile(filename, filepath) { this.tasks[task_id].status = this.statusCode.running;
let task = {}; this.taskStatus[this.statusCode.running] ++;
this.taskStatus[this.statusCode.waiting] --;
try { return true;
task.id = this.getTaskId(filename); }
task.status = this.statusCode.waiting;
task.url = await readFile(filepath, { encoding: 'utf8' }); setTaskDone(task_id) {
if (task.url) { if (typeof(this.tasks[task_id]) == 'undefined') {
task.url = task.url.replace(/[\r\n]/g, ''); return false;
} }
}catch(error) {
console.error('Get task file content failed: %s', error);
}
return task; this.tasks[task_id].status = this.statusCode.done;
} this.taskStatus[this.statusCode.done] ++;
this.taskStatus[this.statusCode.running] --;
addTask(task) { const filepath = this.getTaskFilePath(task_id);
if (typeof(this.tasks[task.id]) != 'undefined') { common.removeFile(filepath); //async delete
return false; return true;
} }
this.tasks[task.id] = task; setTaskFailed(task_id) {
this.taskStatus[task.status] ++; if (typeof(this.tasks[task_id]) == 'undefined') {
this.taskStatus.total ++; return false;
}
return true; this.tasks[task_id].status = this.statusCode.failed;
} this.taskStatus[this.statusCode.failed] ++;
this.taskStatus[this.statusCode.running] --;
async checkTasks() { return true;
if (this.checking == true) {
return;
} }
try { async parseTaskFile(filename, filepath) {
console.log('[%s] TaskMoniter auto check...', common.getTimeString()); let task = {};
this.checking = true; try {
task.id = this.getTaskId(filename);
task.status = this.statusCode.waiting;
const dirPath = path.resolve(this.task_dir); task.url = await readFile(filepath, { encoding: 'utf8' });
const files = await readdir(dirPath); if (task.url) {
let task = null, task_id = null; task.url = task.url.replace(/[\r\n]/g, '');
for (const filename of files) {
if (filename.indexOf('.task') === -1) {continue;} //ignore not *.task files
task_id = this.getTaskId(filename);
if (typeof(this.tasks[task_id]) != 'undefined') { //跳过已经存在的任务
continue;
} }
}catch(error) {
console.error('Get task file content failed: %s', error);
}
return task;
}
task = await this.parseTaskFile(filename, `${dirPath}/${filename}`); addTask(task) {
this.addTask(task); if (typeof(this.tasks[task.id]) != 'undefined') {
return false;
} }
this.checking = false; this.tasks[task.id] = task;
}catch(error) { this.taskStatus[task.status] ++;
this.checking = false; this.taskStatus.total ++;
console.error('Check tasks failed: %s', error);
return true;
} }
}
run() { //开始监控任务目录,把所有任务缓存到内存 async checkTasks() {
console.log('[%s] TaskMoniter started.', common.getTimeString()); if (this.checking == true) {
return;
}
try {
console.log('[%s] TaskMoniter auto check...', common.getTimeString());
this.checking = true;
//auto check new tasks const dirPath = path.resolve(this.task_dir);
const _self = this; const files = await readdir(dirPath);
const task_check_time = this.check_time_gap; let task = null, task_id = null;
const task_auto_run = cron.schedule(`*/${task_check_time} * * * * *`, async () => { for (const filename of files) {
await _self.checkTasks(); if (filename.indexOf('.task') === -1) {continue;} //ignore not *.task files
console.log('Status', _self.getStatus());
}, {
scheduled: false
});
task_auto_run.start(); task_id = this.getTaskId(filename);
console.log('[%s] TaskMoniter auto check started.', common.getTimeString()); if (typeof(this.tasks[task_id]) != 'undefined') { //跳过已经存在的任务
continue;
}
} task = await this.parseTaskFile(filename, `${dirPath}/${filename}`);
this.addTask(task);
}
this.checking = false;
}catch(error) {
this.checking = false;
console.error('Check tasks failed: %s', error);
}
}
run() { //开始监控任务目录,把所有任务缓存到内存
console.log('[%s] TaskMoniter started.', common.getTimeString());
//auto check new tasks
const _self = this;
const task_check_time = this.check_time_gap;
const task_auto_run = cron.schedule(`*/${task_check_time} * * * * *`, async () => {
await _self.checkTasks();
console.log('Status', _self.getStatus());
}, {
scheduled: false
});
task_auto_run.start();
console.log('[%s] TaskMoniter auto check started.', common.getTimeString());
}
} }
export default TaskMoniter; export default TaskMoniter;

20
package.json

@ -1,9 +1,13 @@
{ {
"name": "machege-hero", "name": "machege-hero",
"type": "module", "type": "module",
"dependencies": { "dependencies": {
"@ulixee/cloud": "^2.0.0-alpha.24", "@ulixee/cloud": "^2.0.0-alpha.24",
"@ulixee/hero": "^2.0.0-alpha.24", "@ulixee/hero": "^2.0.0-alpha.24",
"node-cron": "^3.0.2" "node-cron": "^3.0.2",
} "axios": "^1.3.3"
} },
"scripts": {
"start": "node spider.mjs"
}
}

58
spider.mjs

@ -21,8 +21,8 @@ import cron from 'node-cron';
taskMoniter.run(); //监控新任务 taskMoniter.run(); //监控新任务
//配置本地cloud server地址,cloud安装参考:./install_cloud.sh
const heroCloudServer = 'ws://192.168.3.13:1818'; const heroCloudServer = typeof(configs.cloud_server) != 'undefined' && configs.cloud_server ? configs.cloud_server : '';
//spider run //spider run
let spider_is_running = false; let spider_is_running = false;
@ -37,18 +37,18 @@ import cron from 'node-cron';
console.log('New task %s handle by bot %s.', task.url, botName); console.log('New task %s handle by bot %s.', task.url, botName);
let bot = null; let bot = null;
switch (botName) { switch (botName) {
case 'douyin': case 'douyin':
bot = new Douyin(heroCloudServer); bot = new Douyin(heroCloudServer);
break; break;
case 'kuaishou': case 'kuaishou':
bot = new Kuaishou(heroCloudServer); bot = new Kuaishou(heroCloudServer);
break; break;
case 'xigua': case 'xigua':
bot = new Xigua(heroCloudServer); bot = new Xigua(heroCloudServer);
break; break;
case 'bilibili': case 'bilibili':
bot = new Bilibili(heroCloudServer); bot = new Bilibili(heroCloudServer);
break; break;
} }
if (bot) { if (bot) {
@ -62,29 +62,31 @@ import cron from 'node-cron';
if ( if (
await tajian.saveUrlShortcut(task.id, data) await tajian.saveUrlShortcut(task.id, data)
&& await tajian.saveDescriptionFiles(task.id, data) && await tajian.saveDescriptionFiles(task.id, data)
) { ) {
taskMoniter.setTaskDone(task.id); taskMoniter.setTaskDone(task.id);
}else {
taskMoniter.setTaskFailed(task.id);
}
}else { }else {
taskMoniter.setTaskFailed(task.id); taskMoniter.setTaskFailed(task.id);
} }
spider_is_running = false;
}else { }else {
console.error('No bot matched with url %s', task.url);
taskMoniter.setTaskRunning(task.id);
taskMoniter.setTaskFailed(task.id); taskMoniter.setTaskFailed(task.id);
} }
}, {
scheduled: false spider_is_running = false;
}); }else {
console.error('No bot matched with url %s', task.url);
taskMoniter.setTaskRunning(task.id);
taskMoniter.setTaskFailed(task.id);
}
}, {
scheduled: false
});
task_auto_run.start(); task_auto_run.start();
console.log('[%s] Spider started.', common.getTimeString()); console.log('[%s] Spider started.', common.getTimeString());
//TODO: 对接英雄联盟接口:https://herounion.filesite.io
})().catch(error => { })().catch(error => {
console.error("Spider error got:\n%s", error); console.error("Spider error got:\n%s", error);
process.exit(1); process.exit(1);
}); });

18
test/start_cloud.mjs

@ -2,13 +2,13 @@
import { CloudNode } from '@ulixee/cloud'; import { CloudNode } from '@ulixee/cloud';
(async () => { (async () => {
const cloudNode = new CloudNode({ const cloudNode = new CloudNode({
port: 1818, port: 1818,
cloudType: 'private' cloudType: 'private'
}); });
await cloudNode.listen(); await cloudNode.listen();
console.log(`CloudNode started on port ${await cloudNode.port}`); console.log(`CloudNode started on port ${await cloudNode.port}`);
})().catch(error => { })().catch(error => {
console.log('ERROR starting Ulixee CloudNode', error); console.log('ERROR starting Ulixee CloudNode', error);
process.exit(1); process.exit(1);
}); });

16
test/tajia_test.mjs

@ -3,11 +3,11 @@ import TaJian from '../lib/tajian.mjs';
(async () => { (async () => {
const data = { const data = {
url: 'https://v.douyin.com/i8sEyb6/', url: 'https://v.douyin.com/i8sEyb6/',
done: true, done: true,
bot: 'douyin', bot: 'douyin',
title: '自由与成功 - 抖音', title: '自由与成功 - 抖音',
cover: '//p6-pc-sign.douyinpic.com/image-cut-tos-priv/d1b1e96513a755b2d6ff4cf8d8260f9b~tplv-dy-resize-origshort-autoq-75:330.jpeg?biz_tag=pcweb_cover&from=3213915784&s=PackSourceEnum_AWEME_DETAIL&sc=cover&se=false&x-expires=2010128400&x-signature=VuJiezXPv7y13fu63Krn9tIbLvQ%3D' cover: '//p6-pc-sign.douyinpic.com/image-cut-tos-priv/d1b1e96513a755b2d6ff4cf8d8260f9b~tplv-dy-resize-origshort-autoq-75:330.jpeg?biz_tag=pcweb_cover&from=3213915784&s=PackSourceEnum_AWEME_DETAIL&sc=cover&se=false&x-expires=2010128400&x-signature=VuJiezXPv7y13fu63Krn9tIbLvQ%3D'
}; };
const filename = 'douyintest'; const filename = 'douyintest';
@ -20,6 +20,6 @@ import TaJian from '../lib/tajian.mjs';
console.log('descriptions save done', saveDescDone); console.log('descriptions save done', saveDescDone);
})().catch(error => { })().catch(error => {
console.error("Error got:\n%s", error); console.error("Error got:\n%s", error);
process.exit(1); process.exit(1);
}); });
Loading…
Cancel
Save