Browse Source

add bot for all websites

master
filesite 8 months ago
parent
commit
7ee17c68b6
  1. 1
      .gitignore
  2. 15
      README.md
  3. 14
      cloud.mjs
  4. 2
      config.mjs
  5. 2
      install_cloud.sh
  6. 2
      lib/common.mjs
  7. 10
      spider.mjs
  8. 7
      start_cloud.sh
  9. 2
      test/cloud_test.mjs
  10. 3
      test/scrap_test.mjs

1
.gitignore vendored

@ -8,3 +8,4 @@ data/*.jpg @@ -8,3 +8,4 @@ data/*.jpg
data/*.jpeg
data/*.png
data/*.txt
log/tasks_machete_hero.log

15
README.md

@ -52,7 +52,13 @@ npm install @@ -52,7 +52,13 @@ npm install
如果你对npm和node不熟悉,请自行了解。
3. 执行下面命令启动爬虫
3. 执行下面命令启动Hero Cloud
```
./start_cloud.sh
```
4. 执行下面命令启动爬虫spider.mjs
```
npm start
```
@ -64,8 +70,13 @@ npm start -- config_custom.json @@ -64,8 +70,13 @@ npm start -- config_custom.json
在目录todo/里创建任务文件,爬虫检测到新任务后自动抓取数据并保存到data/目录下。
手动添加任务命令示例:
```
echo "https://tajian.tv" > todo/test_01.task
```
4. 二次开发
## 二次开发
编写一个 .mjs 脚本,调用 bot/ 目录下的类库,访问目标网页并解析出所需的数据。

14
cloud.mjs

@ -0,0 +1,14 @@ @@ -0,0 +1,14 @@
//Documents: https://ulixee.org/docs/cloud/modules/cloud-node#constructor
import { CloudNode } from '@ulixee/cloud';
/**
 * Creates a Ulixee CloudNode, starts listening, and logs the port it reports.
 *
 * @returns {Promise<void>} Resolves once the node is listening and the
 *   startup message has been printed; rejects if construction or listen() fails.
 */
async function startCloudNode() {
  const cloudNode = new CloudNode({
    // Same port as the ws://127.0.0.1:1818 address the bots are configured with.
    port: 1818,
    cloudType: 'private',
  });
  await cloudNode.listen();
  // `port` is awaited because the node exposes it asynchronously.
  console.log(`CloudNode started on port ${await cloudNode.port}`);
}

startCloudNode().catch((error) => {
  // Failures are diagnostics: write them to stderr so they are not mixed into
  // regular stdout output, then exit non-zero so callers can detect the failure.
  console.error('ERROR starting Ulixee CloudNode', error);
  process.exit(1);
});

2
config.mjs

@ -26,7 +26,7 @@ let configs = { @@ -26,7 +26,7 @@ let configs = {
//bot相关配置
cloud_server: '',
cloud_server: 'ws://127.0.0.1:1818',
default_mode: 'pc',
userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
viewport: {

2
install_cloud.sh

@ -5,7 +5,7 @@ npm i --save @ulixee/cloud @@ -5,7 +5,7 @@ npm i --save @ulixee/cloud
# start cloud
#npx @ulixee/cloud start
node test/start_cloud.mjs
#node test/start_cloud.mjs
echo "Manual url:"
echo "https://ulixee.org/docs/cloud/"

2
lib/common.mjs

@ -27,7 +27,7 @@ export default { @@ -27,7 +27,7 @@ export default {
},
getBotName: function(url) {
let botName = 'website';
let botName = 'webcrawler';
if (/douyin\.com/ig.test(url)) {
botName = 'douyin';

10
spider.mjs

@ -15,6 +15,7 @@ import Douyin from './bot/Douyin.mjs'; @@ -15,6 +15,7 @@ import Douyin from './bot/Douyin.mjs';
import Kuaishou from './bot/Kuaishou.mjs';
import Xigua from './bot/Xigua.mjs';
import Bilibili from './bot/Bilibili.mjs';
import WebCrawler from './bot/WebCrawler.mjs';
import cron from 'node-cron';
import path from 'node:path';
@ -65,7 +66,7 @@ import path from 'node:path'; @@ -65,7 +66,7 @@ import path from 'node:path';
await common.saveLog(logFile, JSON.stringify(task) + "\n");
const botName = common.getBotName(task.url);
console.log('New task %s handle by bot %s.', task.url, botName);
console.log('New task %s handle by bot %s, url: %s, cloud server: %s', task.id, botName, task.url, heroCloudServer);
let bot = null;
switch (botName) {
case 'douyin':
@ -81,16 +82,19 @@ import path from 'node:path'; @@ -81,16 +82,19 @@ import path from 'node:path';
case 'bilibili':
bot = new Bilibili(heroCloudServer);
break;
default:
bot = = new WebCrawler(heroCloudServer, 'webcrawler');
bot = new WebCrawler(heroCloudServer, botName);
break;
}
if (bot) {
console.log('Spider craping...');
spider_is_running = true;
last_run_time = common.getTimestampInSeconds();
taskMoniter.setTaskRunning(task.id);
let taskStarted = taskMoniter.setTaskRunning(task.id);
const data = await bot.scrap(task.url);
//console.log('Data got by bot', data);

7
start_cloud.sh

@ -0,0 +1,7 @@ @@ -0,0 +1,7 @@
#!/bin/sh
# Start the Hero Cloud node by running cloud.mjs with Node.js.
# The path is relative, so run this script from the directory containing cloud.mjs.
# Alternative kept for reference: start the cloud through the Ulixee CLI instead.
#npx @ulixee/cloud start
node cloud.mjs

2
test/cloud_test.mjs

@ -1,7 +1,7 @@ @@ -1,7 +1,7 @@
import Hero from '@ulixee/hero';
(async () => {
const hero = new Hero({ connectionToCore: 'ws://192.168.3.13:1818' });
const hero = new Hero({ connectionToCore: 'ws://127.0.0.1:1818' });
//const url = 'https://filesite.io';
//const url = 'https://www.google.com';

3
test/scrap_test.mjs

@ -15,7 +15,7 @@ import getConfigs from '../config.mjs'; @@ -15,7 +15,7 @@ import getConfigs from '../config.mjs';
}
console.log('当前测试Bot:%s', test_bot);
const heroCloudServer = 'ws://127.0.0.1:1818';
let heroCloudServer = 'ws://127.0.0.1:1818';
let url = '', data = {};
switch(test_bot) {
@ -109,6 +109,7 @@ import getConfigs from '../config.mjs'; @@ -109,6 +109,7 @@ import getConfigs from '../config.mjs';
url = 'https://www.baidu.com';
url = 'https://www.zhihu.com';
url = 'https://ogp.me/';
url = 'https://www.zhihu.com/signin?next=%2F';
configs.heroTabOptions.timeoutMs = 20000; //所有内容加载完成超时

Loading…
Cancel
Save