From 1aec6f786608793f44c602111aa6d336a455ebe2 Mon Sep 17 00:00:00 2001 From: filesite Date: Tue, 9 Apr 2024 19:53:12 +0800 Subject: [PATCH] add bot contracts and contact --- README.md | 15 +++++++++++++ common.mjs | 15 +++++++++++++ heroUnion.mjs | 21 ++++++++++++------ router_api.mjs | 47 +++++++++++++++++++++++++++++++++++++---- test/heroUnion.test.mjs | 5 ++++- 5 files changed, 92 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 4b56115..49bb4c1 100644 --- a/README.md +++ b/README.md @@ -69,6 +69,7 @@ https://herounion.filesite.io/api/newtask/ uuid url platform +contract data_mode selectors notify_url @@ -79,6 +80,7 @@ sign 参数说明: * platform: url所属平台,目前支持的:抖音、快手、西瓜视频、bilibili +* contract: 数据抓取合约,目前支持的:tajiantv,可由爬虫自己定义并自己实现合约规则 * data_mode: 返回数据格式,默认:json,可选值:json、html @@ -103,6 +105,7 @@ sign ``` 见提交网页抓取任务接口中的参数:notify_url ``` +* 数据格式:JSON,返回header:{Content-Type: application/json} * 请求方法:**POST** * 请求参数: ``` @@ -125,9 +128,12 @@ https://herounion.filesite.io/api/onboard/ name description status: [idle, busy] +platforms: 爬虫支持的平台,可由爬虫定义,也可参考本文档底部“国内知名平台名称列表” +contracts: 支持的数据抓取合约,具体内容由爬虫定义 timestamp country lang +contact: 可选,爬虫提供方联系方式,将在英雄联盟网站展示,便于大家相互联系 ``` 其中country国家代码和lang语言代码参数值请参考下面标准: @@ -173,6 +179,15 @@ var sign = md5( JSON.stringify(sortedParams) + token ); ``` +## 国内知名平台名称列表 + +以下平台名可作为爬虫支持的平台参考: +* douyin - 抖音 +* kuaishou - 快手 +* xigua - 西瓜视频 +* bilibili - B站 + + ## Hero Union 英雄联盟开发进度 更新日期:2023-11-05 diff --git a/common.mjs b/common.mjs index 1d1e6d1..011bdf7 100644 --- a/common.mjs +++ b/common.mjs @@ -120,6 +120,21 @@ class Common { return /^\w{6,32}$/i.test(bot_name); } + //Check the platforms a bot supports: a string of 3 - 100 letters, digits, underscores and commas. Non-strings are rejected because RegExp.test() would coerce them (e.g. an array) and the caller later calls split(',') on the value + isPlatformsOk(platforms) { + return typeof platforms === 'string' && /^[\w,]{3,100}$/.test(platforms); + } + + //Check the contracts a bot supports: a string of 3 - 100 letters, digits, underscores and commas; non-strings are rejected for the same reason as platforms + isContractsOk(contracts) { + return typeof contracts === 'string' && /^[\w,]{3,100}$/.test(contracts); + } + + //Check the bot provider's contact: a string of 6 - 50 non-whitespace characters + isContactOk(contact) { + return typeof contact === 'string' && /^\S{6,50}$/.test(contact); + } +
getLogArguments() { let args = []; let localTime = this.getLocalTimeString('zh-Hans-CN', 'Asia/Shanghai'); diff --git a/heroUnion.mjs b/heroUnion.mjs index 30d8b5b..31a04e4 100644 --- a/heroUnion.mjs +++ b/heroUnion.mjs @@ -126,8 +126,9 @@ class HeroUnion { * country: '', * lang: '', * url: '', - * platform: '', - * data_mode: '', //json, html + * platform: '', //platform the target URL belongs to; see the platforms the bots support + * contract: '', //data-scraping contract; bots that support this contract scrape data according to it + * data_mode: '', //json, html * notify_url: '', * results: [], * created: 0, //timestamp in seconds @@ -135,7 +136,7 @@ class HeroUnion { * error: '' * } **/ - createTask(uuid, url, platform, data_mode, notify_url, country, lang) { + createTask(uuid, url, platform, contract, data_mode, notify_url, country, lang) { let timestamp = common.getTimestampInSeconds(); let task = { @@ -146,10 +147,11 @@ class HeroUnion { uuid: uuid, url: url, platform: platform, + contract: contract, //可选 - data_mode: 'default', - country: 'china', + data_mode: 'json', + country: 'cn', lang: 'zh', notify_url: '', results: [], @@ -179,7 +181,7 @@ class HeroUnion { } //参数均可选,获取 1 个待处理的任务 - getWaitingTask(platform, country, lang, data_mode) { + getWaitingTask(platform, contract, country, lang, data_mode) { let searchResult = null; let taskIndex = this.tasks.findIndex(function(item) { @@ -187,6 +189,10 @@ class HeroUnion { return false; } + if (typeof(contract) != 'undefined' && contract && item.contract != contract) { + return false; + } + if (typeof(country) != 'undefined' && country && item.country != country) { return false; } @@ -291,9 +297,12 @@ class HeroUnion { * name * description * status: [idle, busy] + * platforms: [], //supported platforms, may be defined by the bot + * contracts: [], //supported data-scraping contracts, content defined by the bot * timestamp * country * lang + * contact */ heroOnboard(bot) { let cachedBotIndex = this.heros.findIndex((item) => item.name == bot.name), diff --git a/router_api.mjs b/router_api.mjs index 490c116..ca823d3 100644 --- a/router_api.mjs +++ b/router_api.mjs @@ -29,9 +29,33 @@
router.get('/', async (req, res) => { return res.status(200).json(data); }); +/** + * Request parameters + * uuid: user ID + * url: target URL + * platform: platform the target URL belongs to, one of: [douyin, kuaishou, xigua, bilibili] + * contract: data-scraping contract; bots that support this contract scrape data according to it (see the contracts each bot supports) + * data_mode: format of the returned data, one of: [json, html] + * country: country code + * lang: language code + * notify_url: callback URL for notifications + * sign: parameter signature, see “接口参数签名方法” in README.md + **/ router.post('/newtask/', async (req, res) => { + let uuid = req.body.uuid, + url = req.body.url, + platform = req.body.platform, contract = req.body.contract, + data_mode = req.body.data_mode, + country = req.body.country, + lang = req.body.lang, + notify_url = req.body.notify_url; - return res.send('api/newtask/'); + let data = {code: 0, message: ''}; + + //parameter format checks (none implemented yet) + + + return res.status(200).json(data); }); router.get('/gettask/', async (req, res) => { @@ -54,17 +78,23 @@ router.get('/querytask/', async (req, res) => { * name * description * status: [idle, busy] + * platforms: '', //supported platforms, may be defined by the bot + * contracts: '', //supported data-scraping contracts, content defined by the bot * timestamp * country * lang + * contact //contact details of the bot provider */ router.post('/onboard/', async (req, res) => { let bot_name = req.body.name, bot_desc = req.body.description, status = req.body.status, + platforms = req.body.platforms, //comma-separated when more than one + contracts = req.body.contracts, //comma-separated when more than one timestamp = req.body.timestamp, country = req.body.country, - lang = req.body.lang; + lang = req.body.lang, + contact = req.body.contact; let data = { "code": 0, @@ -72,8 +102,8 @@ router.post('/onboard/', async (req, res) => { }; //参数格式检查 - if (!bot_name || !bot_desc || !status || !timestamp) { - data.message = '必填参数name、description、status、timestamp不能为空'; + if (!bot_name || !bot_desc || !status || !timestamp || !platforms || !contracts) { + data.message = '必填参数name、description、status、platforms、contracts、timestamp不能为空'; }else if (common.isBotNameOk(bot_name) == false) { data.message = '爬虫名字必须是6 - 32位英文字母、下划线的组合'; }else if (typeof(bot_desc) != 'string' || bot_desc.length > 100) { @@ -82,10 +112,16 @@
router.post('/onboard/', async (req, res) => { data.message = '爬虫状态status传参错误,其可选值:idle、busy'; }else if (common.isTimestampInSeconds(timestamp) == false) { data.message = '时间戳timestamp请传秒数'; + }else if (common.isPlatformsOk(platforms) == false) { + data.message = '支持的平台platforms应为英文逗号间隔的3 - 100个英文字符串'; + }else if (common.isContractsOk(contracts) == false) { + data.message = '支持的合约contracts应为英文逗号间隔的3 - 100个英文字符串'; }else if (country && common.isIosCountryCode(country) == false) { data.message = '国家代码country请传小写的两位字母,参考两位ISO CODES:https://countrycode.org/'; }else if (lang && common.isIosLangCode(lang) == false) { data.message = '语言代码lang请传小写的两位字母,参考ISO 639-1 Code:https://www.loc.gov/standards/iso639-2/php/code_list.php'; + }else if (contact && common.isContactOk(contact) == false) { + data.message = '联系方式contact应为6 - 50个非空白字符'; } if (!data.message) { @@ -94,6 +130,9 @@ router.post('/onboard/', async (req, res) => { description: bot_desc, status: status, timestamp: timestamp, + platforms: String(platforms).split(',').filter(Boolean), //String() guards non-string input; filter drops empty names left by stray commas such as "a,,b" + contracts: String(contracts).split(',').filter(Boolean), //same normalisation as platforms + contact: contact, //如果没传则填充默认值 country: country ? country.toLowerCase() : 'cn', lang: lang ? lang.toLowerCase() : 'zh' diff --git a/test/heroUnion.test.mjs b/test/heroUnion.test.mjs index 1d31ddf..238249f 100644 --- a/test/heroUnion.test.mjs +++ b/test/heroUnion.test.mjs @@ -29,8 +29,11 @@ test('Hero onboard test', async (t) => { description: 'Hero test 测试爬虫', status: 'idle', timestamp: common.getTimestampInSeconds(), + platforms: 'douyin,xigua', + contracts: 'tajiantv', country: 'cn', - lang: 'zh' + lang: 'zh', + contact: 'https://tajian.tv' }; let api = 'http://127.0.0.1:8080/api/onboard/';