From 7ee17c68b6ab66d7770452db99b27ac340e427a0 Mon Sep 17 00:00:00 2001 From: filesite Date: Thu, 16 May 2024 08:24:38 +0800 Subject: [PATCH] add bot for all websites --- .gitignore | 1 + README.md | 15 +++++++++++++-- cloud.mjs | 14 ++++++++++++++ config.mjs | 2 +- install_cloud.sh | 2 +- lib/common.mjs | 2 +- spider.mjs | 10 +++++++--- start_cloud.sh | 7 +++++++ test/cloud_test.mjs | 2 +- test/scrap_test.mjs | 3 ++- 10 files changed, 48 insertions(+), 10 deletions(-) create mode 100644 cloud.mjs create mode 100755 start_cloud.sh diff --git a/.gitignore b/.gitignore index a16a9d1..f5fe7f8 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,4 @@ data/*.jpg data/*.jpeg data/*.png data/*.txt +log/tasks_machete_hero.log diff --git a/README.md b/README.md index 64807f2..6dbac27 100644 --- a/README.md +++ b/README.md @@ -52,7 +52,13 @@ npm install 如果你对npm和node不熟悉,请自行了解。 -3. 执行下面命令启动爬虫 +3. 执行下面命令启动Hero Cloud +``` +./start_cloud.sh +``` + + +4. 执行下面命令启动爬虫spider.mjs ``` npm start ``` @@ -64,8 +70,13 @@ npm start -- config_custom.json 在目录todo/里创建任务文件,爬虫检测到新任务后自动抓取数据并保存到data/目录下。 +手动添加任务命令示例: +``` +echo "https://tajian.tv" > todo/test_01.task +``` + -4. 二次开发 +## 二次开发 写一个.mjs脚本,调用bot/下的类库,实现目标网页访问和解析获取所需数据。 diff --git a/cloud.mjs b/cloud.mjs new file mode 100644 index 0000000..70d6265 --- /dev/null +++ b/cloud.mjs @@ -0,0 +1,14 @@ +//Documents: https://ulixee.org/docs/cloud/modules/cloud-node#constructor +import { CloudNode } from '@ulixee/cloud'; + +(async () => { + const cloudNode = new CloudNode({ + port: 1818, + cloudType: 'private' + }); + await cloudNode.listen(); + console.log(`CloudNode started on port ${await cloudNode.port}`); +})().catch(error => { + console.log('ERROR starting Ulixee CloudNode', error); + process.exit(1); +}); \ No newline at end of file diff --git a/config.mjs b/config.mjs index b67fbe5..efb6b80 100644 --- a/config.mjs +++ b/config.mjs @@ -26,7 +26,7 @@ let configs = { //bot相关配置 - cloud_server: '', + cloud_server: 'ws://127.0.0.1:1818', default_mode: 'pc', userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36', viewport: { diff --git a/install_cloud.sh b/install_cloud.sh index 4e67b80..a6857d1 100644 --- a/install_cloud.sh +++ b/install_cloud.sh @@ -5,7 +5,7 @@ npm i --save @ulixee/cloud # start cloud #npx @ulixee/cloud start -node test/start_cloud.mjs +#node test/start_cloud.mjs echo "Manual url:" echo "https://ulixee.org/docs/cloud/" diff --git a/lib/common.mjs b/lib/common.mjs index e2284e5..4d81f57 100644 --- a/lib/common.mjs +++ b/lib/common.mjs @@ -27,7 +27,7 @@ export default { }, getBotName: function(url) { - let botName = 'website'; + let botName = 'webcrawler'; if (/douyin\.com/ig.test(url)) { botName = 'douyin'; diff --git a/spider.mjs b/spider.mjs index a7f3f15..db9e592 100644 --- a/spider.mjs +++ b/spider.mjs @@ -15,6 +15,7 @@ import Douyin from './bot/Douyin.mjs'; import Kuaishou from './bot/Kuaishou.mjs'; import Xigua from './bot/Xigua.mjs'; import Bilibili from './bot/Bilibili.mjs'; +import WebCrawler from './bot/WebCrawler.mjs'; import cron from 'node-cron'; import path from 'node:path'; @@ -65,7 +66,7 @@ import path from 'node:path'; await common.saveLog(logFile, JSON.stringify(task) + "\n"); const botName = common.getBotName(task.url); - console.log('New task %s handle by bot %s.', task.url, botName); + console.log('New task %s handle by bot %s, url: %s, cloud server: %s', task.id, botName, task.url, heroCloudServer); let bot = null; switch (botName) { case 'douyin': @@ -81,16 +82,19 @@ import path from 'node:path'; case 'bilibili': bot = new Bilibili(heroCloudServer); break; + default: - bot = = new WebCrawler(heroCloudServer, 'webcrawler'); + bot = new WebCrawler(heroCloudServer, botName); break; } if (bot) { + console.log('Spider craping...'); + spider_is_running = true; last_run_time = common.getTimestampInSeconds(); - taskMoniter.setTaskRunning(task.id); + let taskStarted = taskMoniter.setTaskRunning(task.id); const data = await bot.scrap(task.url); //console.log('Data got by bot', data); diff --git a/start_cloud.sh b/start_cloud.sh new file mode 100755 index 0000000..4b71222 --- /dev/null +++ b/start_cloud.sh @@ -0,0 +1,7 @@ +#!/bin/sh + + +# start cloud +#npx @ulixee/cloud start +node cloud.mjs + diff --git a/test/cloud_test.mjs b/test/cloud_test.mjs index 73c6186..4e1761e 100644 --- a/test/cloud_test.mjs +++ b/test/cloud_test.mjs @@ -1,7 +1,7 @@ import Hero from '@ulixee/hero'; (async () => { - const hero = new Hero({ connectionToCore: 'ws://192.168.3.13:1818' }); + const hero = new Hero({ connectionToCore: 'ws://127.0.0.1:1818' }); //const url = 'https://filesite.io'; //const url = 'https://www.google.com'; diff --git a/test/scrap_test.mjs b/test/scrap_test.mjs index bf2b212..20e851a 100644 --- a/test/scrap_test.mjs +++ b/test/scrap_test.mjs @@ -15,7 +15,7 @@ import getConfigs from '../config.mjs'; } console.log('当前测试Bot:%s', test_bot); - const heroCloudServer = 'ws://127.0.0.1:1818'; + let heroCloudServer = 'ws://127.0.0.1:1818'; let url = '', data = {}; switch(test_bot) { @@ -109,6 +109,7 @@ import getConfigs from '../config.mjs'; url = 'https://www.baidu.com'; url = 'https://www.zhihu.com'; url = 'https://ogp.me/'; + url = 'https://www.zhihu.com/signin?next=%2F'; configs.heroTabOptions.timeoutMs = 20000; //所有内容加载完成超时