Browse Source

code reindent, add config for herounion

master
filesite 8 months ago
parent
commit
87754b28a0
  1. 20
      README.md
  2. 92
      config.mjs
  3. 265
      lib/taskMoniter.mjs
  4. 20
      package.json
  5. 58
      spider.mjs
  6. 18
      test/start_cloud.mjs
  7. 16
      test/tajia_test.mjs

20
README.md

@@ -37,13 +37,14 @@ Hero scripts of machete.
## 使用方法
1. 下载本源码到本地后,进入项目根目录
1. 下载本源码到本地后,进入项目根目录
```
git clone "https://git.filesite.io/filesite/machete_hero.git"
cd machete_hero/
```
2. 执行下面命令安装依赖包:
2. 执行下面命令安装依赖包
```
npm install
```
@@ -51,8 +52,19 @@ npm install
如果你对npm和node不熟悉,请自行了解。
3. 写一个.mjs脚本,调用bot/下的类库,实现目标网页访问和解析获取所需数据。
3. 执行下面命令启动爬虫
```
npm start
```
在目录todo/里创建任务文件,爬虫检测到新任务后自动抓取数据并保存到data/目录下。
4. 二次开发
写一个.mjs脚本,调用bot/下的类库,实现目标网页访问和解析获取所需数据。
还可以参考bot/下的类库,实现对任意网站的数据抓取。
bot/目录下的类库调用方法,可参考test/scrap_test.mjs测试脚本,
测试脚本使用方法见test/README.md文档。

92
config.mjs

@@ -1,45 +1,57 @@
export default {
//自动任务相关配置
task_list_dir: 'todo/', //待抓取任务文件保存目录
data_save_dir: 'data/', //抓取完成数据保存目录,文件格式:.url快捷方式,详细说明见:https://filesite.io
//bot相关配置
//userAgent: '~ chrome >= 114 && mac', //指定操作系统和浏览器版本
userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
viewport: {
width: 1440,
height: 900
},
viewports: {
mob: {
width: 375,
height: 667
//自动任务相关配置
task_list_dir: 'todo/', //待抓取任务文件保存目录
data_save_dir: 'data/', //抓取完成数据保存目录,文件格式:.url快捷方式,详细说明见:https://filesite.io
//herounion对接配置
herounion: {
name: 'machete_hero', //爬虫名字
description: '支持Machete的TaJian皮肤的hero爬虫', //爬虫简介
platforms: 'douyin,kuaishou,xigua,bilibili', //爬虫支持的平台
contracts: 'tajiantv', //爬虫支持的数据采集合约(可二次开发自定义)
country: 'cn', //爬虫所在国家
lang: 'zh', //爬虫支持的语言
contact: 'https://filesite.io', //爬虫的联系方式
},
pc: {
//bot相关配置
cloud_server: 'ws://192.168.3.13:1818',
userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
viewport: {
width: 1440,
height: 900
},
},
//请求参数
heroBotOptions: {
timeoutMs: 20000,
referrer: '',
},
//网页tab参数
heroTabOptions: {
timeoutMs: 30000
},
//常用浏览器user-agent
userAgents: {
iphone_chrome: 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1',
iphone_wechat: 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/604.4.7 (KHTML, like Gecko) Mobile/15C202 MicroMessenger/6.6.1 NetType/4G Language/zh_CN',
mac_chrome: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
android_wechat: 'Mozilla/5.0 (Linux; Android 7.1.1; OD103 Build/NMF26F; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.49 Mobile MQQBrowser/6.2 TBS/043632 Safari/537.36 MicroMessenger/6.6.1.1220(0x26060135) NetType/4G Language/zh_CN',
}
};
viewports: {
mob: {
width: 375,
height: 667
},
pc: {
width: 1440,
height: 900
},
},
//请求参数
heroBotOptions: {
timeoutMs: 20000,
referrer: '',
},
//网页tab参数
heroTabOptions: {
timeoutMs: 30000
},
//常用浏览器user-agent
userAgents: {
iphone_chrome: 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1',
iphone_wechat: 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/604.4.7 (KHTML, like Gecko) Mobile/15C202 MicroMessenger/6.6.1 NetType/4G Language/zh_CN',
mac_chrome: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
android_wechat: 'Mozilla/5.0 (Linux; Android 7.1.1; OD103 Build/NMF26F; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.49 Mobile MQQBrowser/6.2 TBS/043632 Safari/537.36 MicroMessenger/6.6.1.1220(0x26060135) NetType/4G Language/zh_CN',
}
};

265
lib/taskMoniter.mjs

@@ -16,175 +16,172 @@ import path from 'node:path';
import cron from 'node-cron';
class TaskMoniter {
constructor(task_list_dir) {
this.check_time_gap = 30; //检测间隔时间,单位:秒
this.checking = false;
/**
 * Create a monitor with an empty in-memory task registry and zeroed counters.
 * @param {string} task_list_dir - directory that holds the task files to watch
 */
constructor(task_list_dir) {
this.check_time_gap = 30; //check interval, in seconds
this.checking = false; //true while checkTasks() is scanning; used to skip overlapping scans
this.task_dir = task_list_dir; //watched directory: where the task files are stored
this.tasks = {}; //in-memory task list, keyed by task id
this.taskStatus = { //current task counters
total: 0, //total number of tasks
waiting: 0, //tasks waiting to run
running: 0, //tasks currently running
done: 0, //tasks completed
failed: 0 //tasks that failed
};
//status names; each one is also used as the key of its counter in taskStatus
this.statusCode = {
waiting: 'waiting',
running: 'running',
done: 'done',
failed: 'failed',
};
}
/**
 * Build the path of the task file that belongs to a task id:
 * the resolved task directory, a '/', the id and the '.task' extension.
 * @param {string} task_id - task id
 * @returns {string} path of the task file
 */
getTaskFilePath(task_id) {
const dirPath = path.resolve(this.task_dir);
return `${dirPath}/${task_id}.task`;
}
//注意:任务文件名不能重复,已经用过的文件名不能再使用
//推荐以时间戳为任务文件名,如:1694762776985.task
getTaskId(filename) {
return filename.replace('.task', '');
}
/**
 * Return the task counters (total / waiting / running / done / failed).
 * The live taskStatus object is returned, not a copy.
 * @returns {object} the taskStatus counters object
 */
getStatus() {
return this.taskStatus;
}
getNewTask() {
let task = null;
for (const id in this.tasks) {
if (this.tasks[id].status == this.statusCode.waiting) {
task = this.tasks[id];
break;
}
this.task_dir = task_list_dir; //监控目录:任务列表保存目录
this.tasks = {}; //内存中的任务列表
this.taskStatus = { //当前任务状态
total: 0, //总任务数
waiting: 0, //等待执行的任务数
running: 0, //正在执行的任务数
done: 0, //已完成的任务数
failed: 0 //执行失败的任务数
};
this.statusCode = {
waiting: 'waiting',
running: 'running',
done: 'done',
failed: 'failed',
};
}
return task;
}
setTaskRunning(task_id) {
if (typeof(this.tasks[task_id]) == 'undefined') {
return false;
/**
 * Build the path of the task file that belongs to a task id:
 * the resolved task directory, a '/', the id and the '.task' extension.
 * @param {string} task_id - task id
 * @returns {string} path of the task file
 */
getTaskFilePath(task_id) {
const dirPath = path.resolve(this.task_dir);
return `${dirPath}/${task_id}.task`;
}
this.tasks[task_id].status = this.statusCode.running;
this.taskStatus[this.statusCode.running] ++;
this.taskStatus[this.statusCode.waiting] --;
return true;
}
//注意:任务文件名不能重复,已经用过的文件名不能再使用
//推荐以时间戳为任务文件名,如:1694762776985.task
getTaskId(filename) {
return filename.replace('.task', '');
}
setTaskDone(task_id) {
if (typeof(this.tasks[task_id]) == 'undefined') {
return false;
/**
 * Return the task counters (total / waiting / running / done / failed).
 * The live taskStatus object is returned, not a copy.
 * @returns {object} the taskStatus counters object
 */
getStatus() {
return this.taskStatus;
}
this.tasks[task_id].status = this.statusCode.done;
this.taskStatus[this.statusCode.done] ++;
this.taskStatus[this.statusCode.running] --;
getNewTask() {
let task = null;
const filepath = this.getTaskFilePath(task_id);
common.removeFile(filepath); //async delete
return true;
}
for (const id in this.tasks) {
if (this.tasks[id].status == this.statusCode.waiting) {
task = this.tasks[id];
break;
}
}
setTaskFailed(task_id) {
if (typeof(this.tasks[task_id]) == 'undefined') {
return false;
return task;
}
this.tasks[task_id].status = this.statusCode.failed;
this.taskStatus[this.statusCode.failed] ++;
this.taskStatus[this.statusCode.running] --;
return true;
}
setTaskRunning(task_id) {
if (typeof(this.tasks[task_id]) == 'undefined') {
return false;
}
async parseTaskFile(filename, filepath) {
let task = {};
this.tasks[task_id].status = this.statusCode.running;
this.taskStatus[this.statusCode.running] ++;
this.taskStatus[this.statusCode.waiting] --;
try {
task.id = this.getTaskId(filename);
task.status = this.statusCode.waiting;
return true;
}
task.url = await readFile(filepath, { encoding: 'utf8' });
if (task.url) {
task.url = task.url.replace(/[\r\n]/g, '');
setTaskDone(task_id) {
if (typeof(this.tasks[task_id]) == 'undefined') {
return false;
}
}catch(error) {
console.error('Get task file content failed: %s', error);
}
return task;
}
this.tasks[task_id].status = this.statusCode.done;
this.taskStatus[this.statusCode.done] ++;
this.taskStatus[this.statusCode.running] --;
addTask(task) {
if (typeof(this.tasks[task.id]) != 'undefined') {
return false;
const filepath = this.getTaskFilePath(task_id);
common.removeFile(filepath); //async delete
return true;
}
this.tasks[task.id] = task;
this.taskStatus[task.status] ++;
this.taskStatus.total ++;
setTaskFailed(task_id) {
if (typeof(this.tasks[task_id]) == 'undefined') {
return false;
}
return true;
}
this.tasks[task_id].status = this.statusCode.failed;
this.taskStatus[this.statusCode.failed] ++;
this.taskStatus[this.statusCode.running] --;
async checkTasks() {
if (this.checking == true) {
return;
return true;
}
try {
console.log('[%s] TaskMoniter auto check...', common.getTimeString());
async parseTaskFile(filename, filepath) {
let task = {};
this.checking = true;
try {
task.id = this.getTaskId(filename);
task.status = this.statusCode.waiting;
const dirPath = path.resolve(this.task_dir);
const files = await readdir(dirPath);
let task = null, task_id = null;
for (const filename of files) {
if (filename.indexOf('.task') === -1) {continue;} //ignore not *.task files
task_id = this.getTaskId(filename);
if (typeof(this.tasks[task_id]) != 'undefined') { //跳过已经存在的任务
continue;
task.url = await readFile(filepath, { encoding: 'utf8' });
if (task.url) {
task.url = task.url.replace(/[\r\n]/g, '');
}
}catch(error) {
console.error('Get task file content failed: %s', error);
}
return task;
}
task = await this.parseTaskFile(filename, `${dirPath}/${filename}`);
this.addTask(task);
addTask(task) {
if (typeof(this.tasks[task.id]) != 'undefined') {
return false;
}
this.checking = false;
}catch(error) {
this.checking = false;
console.error('Check tasks failed: %s', error);
this.tasks[task.id] = task;
this.taskStatus[task.status] ++;
this.taskStatus.total ++;
return true;
}
}
run() { //开始监控任务目录,把所有任务缓存到内存
console.log('[%s] TaskMoniter started.', common.getTimeString());
async checkTasks() {
if (this.checking == true) {
return;
}
try {
console.log('[%s] TaskMoniter auto check...', common.getTimeString());
this.checking = true;
//auto check new tasks
const _self = this;
const task_check_time = this.check_time_gap;
const task_auto_run = cron.schedule(`*/${task_check_time} * * * * *`, async () => {
await _self.checkTasks();
console.log('Status', _self.getStatus());
}, {
scheduled: false
});
const dirPath = path.resolve(this.task_dir);
const files = await readdir(dirPath);
let task = null, task_id = null;
for (const filename of files) {
if (filename.indexOf('.task') === -1) {continue;} //ignore not *.task files
task_auto_run.start();
console.log('[%s] TaskMoniter auto check started.', common.getTimeString());
task_id = this.getTaskId(filename);
if (typeof(this.tasks[task_id]) != 'undefined') { //跳过已经存在的任务
continue;
}
}
task = await this.parseTaskFile(filename, `${dirPath}/${filename}`);
this.addTask(task);
}
this.checking = false;
}catch(error) {
this.checking = false;
console.error('Check tasks failed: %s', error);
}
}
/**
 * Start monitoring the task directory.
 * Creates a cron job that calls checkTasks() and then logs the current counters,
 * and starts that job. The schedule is built from check_time_gap.
 */
run() { //start monitoring the task directory and cache all tasks in memory
console.log('[%s] TaskMoniter started.', common.getTimeString());
//auto check new tasks
const _self = this;
const task_check_time = this.check_time_gap;
//six-field cron expression; presumably the first field is seconds (check_time_gap is documented in seconds) - confirm against node-cron
const task_auto_run = cron.schedule(`*/${task_check_time} * * * * *`, async () => {
await _self.checkTasks();
console.log('Status', _self.getStatus());
}, {
scheduled: false
});
//the job is created with scheduled: false, so it is started explicitly here
task_auto_run.start();
console.log('[%s] TaskMoniter auto check started.', common.getTimeString());
}
}
export default TaskMoniter;
export default TaskMoniter;

20
package.json

@@ -1,9 +1,13 @@
{
"name": "machege-hero",
"type": "module",
"dependencies": {
"@ulixee/cloud": "^2.0.0-alpha.24",
"@ulixee/hero": "^2.0.0-alpha.24",
"node-cron": "^3.0.2"
}
}
"name": "machege-hero",
"type": "module",
"dependencies": {
"@ulixee/cloud": "^2.0.0-alpha.24",
"@ulixee/hero": "^2.0.0-alpha.24",
"node-cron": "^3.0.2",
"axios": "^1.3.3"
},
"scripts": {
"start": "node spider.mjs"
}
}

58
spider.mjs

@@ -21,8 +21,8 @@ import cron from 'node-cron';
taskMoniter.run(); //监控新任务
const heroCloudServer = 'ws://192.168.3.13:1818';
//配置本地cloud server地址,cloud安装参考:./install_cloud.sh
const heroCloudServer = typeof(configs.cloud_server) != 'undefined' && configs.cloud_server ? configs.cloud_server : '';
//spider run
let spider_is_running = false;
@@ -37,18 +37,18 @@ import cron from 'node-cron';
console.log('New task %s handle by bot %s.', task.url, botName);
let bot = null;
switch (botName) {
case 'douyin':
bot = new Douyin(heroCloudServer);
break;
case 'kuaishou':
bot = new Kuaishou(heroCloudServer);
break;
case 'xigua':
bot = new Xigua(heroCloudServer);
break;
case 'bilibili':
bot = new Bilibili(heroCloudServer);
break;
case 'douyin':
bot = new Douyin(heroCloudServer);
break;
case 'kuaishou':
bot = new Kuaishou(heroCloudServer);
break;
case 'xigua':
bot = new Xigua(heroCloudServer);
break;
case 'bilibili':
bot = new Bilibili(heroCloudServer);
break;
}
if (bot) {
@@ -62,29 +62,31 @@ import cron from 'node-cron';
if (
await tajian.saveUrlShortcut(task.id, data)
&& await tajian.saveDescriptionFiles(task.id, data)
) {
) {
taskMoniter.setTaskDone(task.id);
}else {
taskMoniter.setTaskFailed(task.id);
}
}else {
taskMoniter.setTaskFailed(task.id);
}
spider_is_running = false;
}else {
console.error('No bot matched with url %s', task.url);
taskMoniter.setTaskRunning(task.id);
taskMoniter.setTaskFailed(task.id);
}
}, {
scheduled: false
});
spider_is_running = false;
}else {
console.error('No bot matched with url %s', task.url);
taskMoniter.setTaskRunning(task.id);
taskMoniter.setTaskFailed(task.id);
}
}, {
scheduled: false
});
task_auto_run.start();
console.log('[%s] Spider started.', common.getTimeString());
//TODO: integrate with the HeroUnion API: https://herounion.filesite.io
})().catch(error => {
console.error("Spider error got:\n%s", error);
process.exit(1);
});
console.error("Spider error got:\n%s", error);
process.exit(1);
});

18
test/start_cloud.mjs

@@ -2,13 +2,13 @@
import { CloudNode } from '@ulixee/cloud';
(async () => {
const cloudNode = new CloudNode({
port: 1818,
cloudType: 'private'
});
await cloudNode.listen();
console.log(`CloudNode started on port ${await cloudNode.port}`);
const cloudNode = new CloudNode({
port: 1818,
cloudType: 'private'
});
await cloudNode.listen();
console.log(`CloudNode started on port ${await cloudNode.port}`);
})().catch(error => {
console.log('ERROR starting Ulixee CloudNode', error);
process.exit(1);
});
console.log('ERROR starting Ulixee CloudNode', error);
process.exit(1);
});

16
test/tajia_test.mjs

@@ -3,11 +3,11 @@ import TaJian from '../lib/tajian.mjs';
(async () => {
const data = {
url: 'https://v.douyin.com/i8sEyb6/',
done: true,
bot: 'douyin',
title: '自由与成功 - 抖音',
cover: '//p6-pc-sign.douyinpic.com/image-cut-tos-priv/d1b1e96513a755b2d6ff4cf8d8260f9b~tplv-dy-resize-origshort-autoq-75:330.jpeg?biz_tag=pcweb_cover&from=3213915784&s=PackSourceEnum_AWEME_DETAIL&sc=cover&se=false&x-expires=2010128400&x-signature=VuJiezXPv7y13fu63Krn9tIbLvQ%3D'
url: 'https://v.douyin.com/i8sEyb6/',
done: true,
bot: 'douyin',
title: '自由与成功 - 抖音',
cover: '//p6-pc-sign.douyinpic.com/image-cut-tos-priv/d1b1e96513a755b2d6ff4cf8d8260f9b~tplv-dy-resize-origshort-autoq-75:330.jpeg?biz_tag=pcweb_cover&from=3213915784&s=PackSourceEnum_AWEME_DETAIL&sc=cover&se=false&x-expires=2010128400&x-signature=VuJiezXPv7y13fu63Krn9tIbLvQ%3D'
};
const filename = 'douyintest';
@@ -20,6 +20,6 @@ import TaJian from '../lib/tajian.mjs';
console.log('descriptions save done', saveDescDone);
})().catch(error => {
console.error("Error got:\n%s", error);
process.exit(1);
});
console.error("Error got:\n%s", error);
process.exit(1);
});
Loading…
Cancel
Save