Browse Source

add bot contracts and contact

master
filesite 9 months ago
parent
commit
1aec6f7866
  1. 15
      README.md
  2. 15
      common.mjs
  3. 21
      heroUnion.mjs
  4. 47
      router_api.mjs
  5. 5
      test/heroUnion.test.mjs

15
README.md

@ -69,6 +69,7 @@ https://herounion.filesite.io/api/newtask/ @@ -69,6 +69,7 @@ https://herounion.filesite.io/api/newtask/
uuid
url
platform
contract
data_mode
selectors
notify_url
@ -79,6 +80,7 @@ sign @@ -79,6 +80,7 @@ sign
参数说明:
* platform: url所属平台,目前支持的:抖音、快手、西瓜视频、bilibili
* contract: 数据抓取合约,目前支持的:tajiantv,可由爬虫自己定义并自己实现合约规则
* data_mode: 返回数据格式,默认:json,可选值:json、html
@ -103,6 +105,7 @@ sign @@ -103,6 +105,7 @@ sign
```
见提交网页抓取任务接口中的参数:notify_url
```
* 数据格式:JSON,返回header:{Content-Type: application/json}
* 请求方法:**POST**
* 请求参数:
```
@ -125,9 +128,12 @@ https://herounion.filesite.io/api/onboard/ @@ -125,9 +128,12 @@ https://herounion.filesite.io/api/onboard/
name
description
status: [idle, busy]
platforms: 爬虫支持的平台,可由爬虫定义,也可参考本文档底部“国内知名平台名称列表”
contracts: 支持的数据抓取合约,具体内容由爬虫定义
timestamp
country
lang
contact: 可选,爬虫提供方联系方式,将在英雄联盟网站展示,便于大家相互联系
```
其中country国家代码和lang语言代码参数值请参考下面标准:
@ -173,6 +179,15 @@ var sign = md5( JSON.stringify(sortedParams) + token ); @@ -173,6 +179,15 @@ var sign = md5( JSON.stringify(sortedParams) + token );
```
## 国内知名平台名称列表
以下平台名可作为爬虫支持的平台参考:
* douyin - 抖音
* kuaishou - 快手
* xigua - 西瓜视频
* bilibili - B站
## Hero Union 英雄联盟开发进度
更新日期:2023-11-05

15
common.mjs

@ -120,6 +120,21 @@ class Common { @@ -120,6 +120,21 @@ class Common {
return /^\w{6,32}$/i.test(bot_name);
}
//检查爬虫支持的平台是否符合标准:用英文逗号间隔、总长度为3 - 100个字符的英文字符串
isPlatformsOk(platforms) {
return /^[\w,]{3,100}$/i.test(platforms);
}
//检查爬虫支持的合约是否符合标准:用英文逗号间隔、总长度为3 - 100个字符的英文字符串
isContractsOk(contracts) {
return /^[\w,]{3,100}$/i.test(contracts);
}
//检查爬虫提供方联系方式是否符合标准:6 - 50个非空白字符
isContactOk(contact) {
return /^\S{6,50}$/i.test(contact);
}
getLogArguments() {
let args = [];
let localTime = this.getLocalTimeString('zh-Hans-CN', 'Asia/Shanghai');

21
heroUnion.mjs

@ -126,8 +126,9 @@ class HeroUnion { @@ -126,8 +126,9 @@ class HeroUnion {
* country: '',
* lang: '',
* url: '',
* platform: '',
* data_mode: '', //json, html
* platform: '', //目标网址所属平台,具体参考爬虫所支持的平台
* contract: '', //需要抓取的数据合约,凡是支持此合约的爬虫将根据合约内容抓取数据
* data_mode: '', //json, html
* notify_url: '',
* results: [],
* created: 0, //timestamp in seconds
@ -135,7 +136,7 @@ class HeroUnion { @@ -135,7 +136,7 @@ class HeroUnion {
* error: ''
* }
**/
createTask(uuid, url, platform, data_mode, notify_url, country, lang) {
createTask(uuid, url, platform, contract, data_mode, notify_url, country, lang) {
let timestamp = common.getTimestampInSeconds();
let task = {
@ -146,10 +147,11 @@ class HeroUnion { @@ -146,10 +147,11 @@ class HeroUnion {
uuid: uuid,
url: url,
platform: platform,
contract: contract,
//可选
data_mode: 'default',
country: 'china',
data_mode: 'json',
country: 'cn',
lang: 'zh',
notify_url: '',
results: [],
@ -179,7 +181,7 @@ class HeroUnion { @@ -179,7 +181,7 @@ class HeroUnion {
}
//参数均可选,获取 1 个待处理的任务
getWaitingTask(platform, country, lang, data_mode) {
getWaitingTask(platform, contract, country, lang, data_mode) {
let searchResult = null;
let taskIndex = this.tasks.findIndex(function(item) {
@ -187,6 +189,10 @@ class HeroUnion { @@ -187,6 +189,10 @@ class HeroUnion {
return false;
}
if (typeof(contract) != 'undefined' && contract && item.contract != contract) {
return false;
}
if (typeof(country) != 'undefined' && country && item.country != country) {
return false;
}
@ -291,9 +297,12 @@ class HeroUnion { @@ -291,9 +297,12 @@ class HeroUnion {
* name
* description
* status: [idle, busy]
* platforms: [], //支持的平台,可由爬虫定义
* contracts: [], //支持的数据抓取合约,具体内容由爬虫定义
* timestamp
* country
* lang
* contact
*/
heroOnboard(bot) {
let cachedBotIndex = this.heros.findIndex((item) => item.name == bot.name),

47
router_api.mjs

@ -29,9 +29,33 @@ router.get('/', async (req, res) => { @@ -29,9 +29,33 @@ router.get('/', async (req, res) => {
return res.status(200).json(data);
});
/**
* 参数列表
* uuid: 用户ID
* url: 目标网址
* platform: 目标网址所属平台,可选值:[douyin, kuaishou, xigua, bilibili]
* contract: 需要抓取的数据合约,凡是支持此合约的爬虫将根据合约内容抓取数据,具体参考爬虫所支持的合约
* data_mode: 返回数据格式,可选值:[json, html]
* country: 国家代码
* lang: 语言代码
* notify_url: 通知回调网址
* sign: 参数签名,签名方法见README.md“接口参数签名方法”
**/
// POST /newtask/ - submit a new scraping task.
// Reads the task parameters from the request body and answers with a JSON
// envelope {code, message}. Parameter validation is not implemented yet, so
// the handler currently always replies with code 0 and an empty message.
router.post('/newtask/', async (req, res) => {
    let uuid = req.body.uuid,
        url = req.body.url,
        platform = req.body.platform,
        contract = req.body.contract,       // documented parameter that was never read from the body
        data_mode = req.body.data_mode,
        country = req.body.country,
        lang = req.body.lang,
        notify_url = req.body.notify_url;

    // The former early `return res.send('api/newtask/')` placeholder was
    // removed: it made the JSON response below unreachable.
    let data = {code: 0, message: ''};

    // TODO: validate parameter formats before accepting the task.

    return res.status(200).json(data);
});
router.get('/gettask/', async (req, res) => {
@ -54,17 +78,23 @@ router.get('/querytask/', async (req, res) => { @@ -54,17 +78,23 @@ router.get('/querytask/', async (req, res) => {
* name
* description
* status: [idle, busy]
* platforms: '', //支持的平台,可由爬虫定义
* contracts: '', //支持的数据抓取合约,具体内容由爬虫定义
* timestamp
* country
* lang
* contact //爬虫提供方的联系方式
*/
router.post('/onboard/', async (req, res) => {
let bot_name = req.body.name,
bot_desc = req.body.description,
status = req.body.status,
platforms = req.body.platforms, //多个则用英文逗号间隔
contracts = req.body.contracts, //多个则用英文逗号间隔
timestamp = req.body.timestamp,
country = req.body.country,
lang = req.body.lang;
lang = req.body.lang,
contact = req.body.contact;
let data = {
"code": 0,
@ -72,8 +102,8 @@ router.post('/onboard/', async (req, res) => { @@ -72,8 +102,8 @@ router.post('/onboard/', async (req, res) => {
};
//参数格式检查
if (!bot_name || !bot_desc || !status || !timestamp) {
data.message = '必填参数name、description、status、timestamp不能为空';
if (!bot_name || !bot_desc || !status || !timestamp || !platforms || !contracts) {
data.message = '必填参数name、description、status、platforms、contracts、timestamp不能为空';
}else if (common.isBotNameOk(bot_name) == false) {
data.message = '爬虫名字必须是6 - 32位英文字母、下划线的组合';
}else if (typeof(bot_desc) != 'string' || bot_desc.length > 100) {
@ -82,10 +112,16 @@ router.post('/onboard/', async (req, res) => { @@ -82,10 +112,16 @@ router.post('/onboard/', async (req, res) => {
data.message = '爬虫状态status传参错误,其可选值:idle、busy';
}else if (common.isTimestampInSeconds(timestamp) == false) {
data.message = '时间戳timestamp请传秒数';
}else if (common.isPlatformsOk(platforms) == false) {
data.message = '支持的平台platforms应为英文逗号间隔的3 - 100个英文字符串';
}else if (common.isContractsOk(contracts) == false) {
data.message = '支持的合约contracts应为英文逗号间隔的3 - 100个英文字符串';
}else if (country && common.isIosCountryCode(country) == false) {
data.message = '国家代码country请传小写的两位字母,参考两位ISO CODES:https://countrycode.org/';
}else if (lang && common.isIosLangCode(lang) == false) {
data.message = '语言代码lang请传小写的两位字母,参考ISO 639-1 Code:https://www.loc.gov/standards/iso639-2/php/code_list.php';
}else if (contact && common.isContactOk(contact) == false) {
data.message = '联系方式contact应为6 - 50个字符';
}
if (!data.message) {
@ -94,6 +130,9 @@ router.post('/onboard/', async (req, res) => { @@ -94,6 +130,9 @@ router.post('/onboard/', async (req, res) => {
description: bot_desc,
status: status,
timestamp: timestamp,
platforms: platforms.split(','),
contracts: contracts.split(','),
contact: contact,
//如果没传则填充默认值
country: country ? country.toLowerCase() : 'cn',
lang: lang ? lang.toLowerCase() : 'zh'

5
test/heroUnion.test.mjs

@ -29,8 +29,11 @@ test('Hero onboard test', async (t) => { @@ -29,8 +29,11 @@ test('Hero onboard test', async (t) => {
description: 'Hero test 测试爬虫',
status: 'idle',
timestamp: common.getTimestampInSeconds(),
platforms: 'douyin,xigua',
contracts: 'tajiantv',
country: 'cn',
lang: 'zh'
lang: 'zh',
contact: 'https://tajian.tv'
};
let api = 'http://127.0.0.1:8080/api/onboard/';

Loading…
Cancel
Save