Browse Source

add bot contracts and contact

master
filesite 8 months ago
parent
commit
1aec6f7866
  1. 15
      README.md
  2. 15
      common.mjs
  3. 21
      heroUnion.mjs
  4. 47
      router_api.mjs
  5. 5
      test/heroUnion.test.mjs

15
README.md

@ -69,6 +69,7 @@ https://herounion.filesite.io/api/newtask/
uuid uuid
url url
platform platform
contract
data_mode data_mode
selectors selectors
notify_url notify_url
@ -79,6 +80,7 @@ sign
参数说明: 参数说明:
* platform: url所属平台,目前支持的:抖音、快手、西瓜视频、bilibili * platform: url所属平台,目前支持的:抖音、快手、西瓜视频、bilibili
* contract: 数据抓取合约,目前支持的:tajiantv,可由爬虫自己定义并自己实现合约规则
* data_mode: 返回数据格式,默认:json,可选值:json、html * data_mode: 返回数据格式,默认:json,可选值:json、html
@ -103,6 +105,7 @@ sign
``` ```
见提交网页抓取任务接口中的参数:notify_url 见提交网页抓取任务接口中的参数:notify_url
``` ```
* 数据格式:JSON,返回header:{Content-Type: application/json}
* 请求方法:**POST** * 请求方法:**POST**
* 请求参数: * 请求参数:
``` ```
@ -125,9 +128,12 @@ https://herounion.filesite.io/api/onboard/
name name
description description
status: [idle, busy] status: [idle, busy]
platforms: 爬虫支持的平台,可由爬虫定义,也可参考本文档底部“国内知名平台名称列表”
contracts: 支持的数据抓取合约,具体内容由爬虫定义
timestamp timestamp
country country
lang lang
contact: 可选,爬虫提供方联系方式,将在英雄联盟网站展示,便于大家相互联系
``` ```
其中country国家代码和lang语言代码参数值请参考下面标准: 其中country国家代码和lang语言代码参数值请参考下面标准:
@ -173,6 +179,15 @@ var sign = md5( JSON.stringify(sortedParams) + token );
``` ```
## 国内知名平台名称列表
以下平台名可作为爬虫支持的平台参考:
* douyin - 抖音
* kuaishou - 快手
* xigua - 西瓜视频
* bilibili - B站
## Hero Union 英雄联盟开发进度 ## Hero Union 英雄联盟开发进度
更新日期:2023-11-05 更新日期:2023-11-05

15
common.mjs

@ -120,6 +120,21 @@ class Common {
return /^\w{6,32}$/i.test(bot_name); return /^\w{6,32}$/i.test(bot_name);
} }
//检查爬虫支持的平台是否符合标准:用英文逗号间隔的最长3 - 100个字符的英文字符串
isPlatformsOk(platforms) {
return /^[\w,]{3,100}$/i.test(platforms);
}
//检查爬虫支持的合约是否符合标准:用英文逗号间隔的最长3 - 100个字符的英文字符串
isContractsOk(contracts) {
return /^[\w,]{3,100}$/i.test(contracts);
}
//检查爬虫提供方联系方式是否符合标准:6 - 50个非空白字符
isContactOk(contact) {
return /^\S{6,50}$/i.test(contact);
}
getLogArguments() { getLogArguments() {
let args = []; let args = [];
let localTime = this.getLocalTimeString('zh-Hans-CN', 'Asia/Shanghai'); let localTime = this.getLocalTimeString('zh-Hans-CN', 'Asia/Shanghai');

21
heroUnion.mjs

@ -126,8 +126,9 @@ class HeroUnion {
* country: '', * country: '',
* lang: '', * lang: '',
* url: '', * url: '',
* platform: '', * platform: '', //目标网址所属平台,具体参考爬虫所支持的平台
* data_mode: '', //json, html * contract: '', //需要抓取的数据合约,凡是支持此合约的爬虫将根据合约内容抓取数据
* data_mode: '', //json, html
* notify_url: '', * notify_url: '',
* results: [], * results: [],
* created: 0, //timestamp in seconds * created: 0, //timestamp in seconds
@ -135,7 +136,7 @@ class HeroUnion {
* error: '' * error: ''
* } * }
**/ **/
createTask(uuid, url, platform, data_mode, notify_url, country, lang) { createTask(uuid, url, platform, contract, data_mode, notify_url, country, lang) {
let timestamp = common.getTimestampInSeconds(); let timestamp = common.getTimestampInSeconds();
let task = { let task = {
@ -146,10 +147,11 @@ class HeroUnion {
uuid: uuid, uuid: uuid,
url: url, url: url,
platform: platform, platform: platform,
contract: contract,
//可选 //可选
data_mode: 'default', data_mode: 'json',
country: 'china', country: 'cn',
lang: 'zh', lang: 'zh',
notify_url: '', notify_url: '',
results: [], results: [],
@ -179,7 +181,7 @@ class HeroUnion {
} }
//参数均可选,获取 1 个待处理的任务 //参数均可选,获取 1 个待处理的任务
getWaitingTask(platform, country, lang, data_mode) { getWaitingTask(platform, contract, country, lang, data_mode) {
let searchResult = null; let searchResult = null;
let taskIndex = this.tasks.findIndex(function(item) { let taskIndex = this.tasks.findIndex(function(item) {
@ -187,6 +189,10 @@ class HeroUnion {
return false; return false;
} }
if (typeof(contract) != 'undefined' && contract && item.contract != contract) {
return false;
}
if (typeof(country) != 'undefined' && country && item.country != country) { if (typeof(country) != 'undefined' && country && item.country != country) {
return false; return false;
} }
@ -291,9 +297,12 @@ class HeroUnion {
* name * name
* description * description
* status: [idle, busy] * status: [idle, busy]
* platforms: [], //支持的平台,可由爬虫定义
* contracts: [], //支持的数据抓取合约,具体内容由爬虫定义
* timestamp * timestamp
* country * country
* lang * lang
* contact
*/ */
heroOnboard(bot) { heroOnboard(bot) {
let cachedBotIndex = this.heros.findIndex((item) => item.name == bot.name), let cachedBotIndex = this.heros.findIndex((item) => item.name == bot.name),

47
router_api.mjs

@ -29,9 +29,33 @@ router.get('/', async (req, res) => {
return res.status(200).json(data); return res.status(200).json(data);
}); });
/**
* 参数列表
* uuid: 用户ID
* url: 目标网址
* platform: 目标网址所属平台,可选值:[douyin, kuaishou, xigua, bilibili]
* contract: 需要抓取的数据合约,凡是支持此合约的爬虫将根据合约内容抓取数据,具体参考爬虫所支持的合约
* data_mode: 返回数据格式,可选值:[json, html]
* country: 国家代码
* lang: 语言代码
* notify_url: 通知回调网址
* sign: 参数签名,签名方法见 README.md 中的“接口参数签名方法”
**/
router.post('/newtask/', async (req, res) => { router.post('/newtask/', async (req, res) => {
let uuid = req.body.uuid,
url = req.body.url,
platform = req.body.platform,
data_mode = req.body.data_mode,
country = req.body.country,
lang = req.body.lang,
notify_url = req.body.notify_url;
let data = {code: 0, message: ''};
return res.send('api/newtask/'); //参数格式检查
return res.status(200).json(data);
}); });
router.get('/gettask/', async (req, res) => { router.get('/gettask/', async (req, res) => {
@ -54,17 +78,23 @@ router.get('/querytask/', async (req, res) => {
* name * name
* description * description
* status: [idle, busy] * status: [idle, busy]
* platforms: '', //支持的平台,可由爬虫定义
* contracts: '', //支持的数据抓取合约,具体内容由爬虫定义
* timestamp * timestamp
* country * country
* lang * lang
* contact //爬虫提供方的联系方式
*/ */
router.post('/onboard/', async (req, res) => { router.post('/onboard/', async (req, res) => {
let bot_name = req.body.name, let bot_name = req.body.name,
bot_desc = req.body.description, bot_desc = req.body.description,
status = req.body.status, status = req.body.status,
platforms = req.body.platforms, //多个则用英文逗号间隔
contracts = req.body.contracts, //多个则用英文逗号间隔
timestamp = req.body.timestamp, timestamp = req.body.timestamp,
country = req.body.country, country = req.body.country,
lang = req.body.lang; lang = req.body.lang,
contact = req.body.contact;
let data = { let data = {
"code": 0, "code": 0,
@ -72,8 +102,8 @@ router.post('/onboard/', async (req, res) => {
}; };
//参数格式检查 //参数格式检查
if (!bot_name || !bot_desc || !status || !timestamp) { if (!bot_name || !bot_desc || !status || !timestamp || !platforms || !contracts) {
data.message = '必填参数name、description、status、timestamp不能为空'; data.message = '必填参数name、description、status、platforms、contracts、timestamp不能为空';
}else if (common.isBotNameOk(bot_name) == false) { }else if (common.isBotNameOk(bot_name) == false) {
data.message = '爬虫名字必须是6 - 32位英文字母、下划线的组合'; data.message = '爬虫名字必须是6 - 32位英文字母、下划线的组合';
}else if (typeof(bot_desc) != 'string' || bot_desc.length > 100) { }else if (typeof(bot_desc) != 'string' || bot_desc.length > 100) {
@ -82,10 +112,16 @@ router.post('/onboard/', async (req, res) => {
data.message = '爬虫状态status传参错误,其可选值:idle、busy'; data.message = '爬虫状态status传参错误,其可选值:idle、busy';
}else if (common.isTimestampInSeconds(timestamp) == false) { }else if (common.isTimestampInSeconds(timestamp) == false) {
data.message = '时间戳timestamp请传秒数'; data.message = '时间戳timestamp请传秒数';
}else if (common.isPlatformsOk(platforms) == false) {
data.message = '支持的平台platforms应为英文逗号间隔的3 - 100个英文字符串';
}else if (common.isContractsOk(contracts) == false) {
data.message = '支持的合约contracts应为英文逗号间隔的3 - 100个英文字符串';
}else if (country && common.isIosCountryCode(country) == false) { }else if (country && common.isIosCountryCode(country) == false) {
data.message = '国家代码country请传小写的两位字母,参考两位ISO CODES:https://countrycode.org/'; data.message = '国家代码country请传小写的两位字母,参考两位ISO CODES:https://countrycode.org/';
}else if (lang && common.isIosLangCode(lang) == false) { }else if (lang && common.isIosLangCode(lang) == false) {
data.message = '语言代码lang请传小写的两位字母,参考ISO 639-1 Code:https://www.loc.gov/standards/iso639-2/php/code_list.php'; data.message = '语言代码lang请传小写的两位字母,参考ISO 639-1 Code:https://www.loc.gov/standards/iso639-2/php/code_list.php';
}else if (contact && common.isContactOk(contact) == false) {
data.message = '联系方式contact应为6 - 50个字符';
} }
if (!data.message) { if (!data.message) {
@ -94,6 +130,9 @@ router.post('/onboard/', async (req, res) => {
description: bot_desc, description: bot_desc,
status: status, status: status,
timestamp: timestamp, timestamp: timestamp,
platforms: platforms.split(','),
contracts: contracts.split(','),
contact: contact,
//如果没传则填充默认值 //如果没传则填充默认值
country: country ? country.toLowerCase() : 'cn', country: country ? country.toLowerCase() : 'cn',
lang: lang ? lang.toLowerCase() : 'zh' lang: lang ? lang.toLowerCase() : 'zh'

5
test/heroUnion.test.mjs

@ -29,8 +29,11 @@ test('Hero onboard test', async (t) => {
description: 'Hero test 测试爬虫', description: 'Hero test 测试爬虫',
status: 'idle', status: 'idle',
timestamp: common.getTimestampInSeconds(), timestamp: common.getTimestampInSeconds(),
platforms: 'douyin,xigua',
contracts: 'tajiantv',
country: 'cn', country: 'cn',
lang: 'zh' lang: 'zh',
contact: 'https://tajian.tv'
}; };
let api = 'http://127.0.0.1:8080/api/onboard/'; let api = 'http://127.0.0.1:8080/api/onboard/';

Loading…
Cancel
Save