/**
 * Builds a task that counts several collections of one app.
 *
 * For every name in `colls` a `_schema` named `<appSlug>.<coll>` is created,
 * initialised with `req`, `res` and `next`, and queried with `{qt: 'count'}`.
 * At most 4 queries run at the same time.
 *
 * @param {*} req - Forwarded to each schema's `init`.
 * @param {*} res - Forwarded to each schema's `init`.
 * @param {*} next - Forwarded to each schema's `init`.
 * @param {string} appSlug - Prefix used to build every schema name.
 * @param {Array} colls - Collection names to query.
 * @returns {Function} A task `(cb) => void`. `cb` is always called with
 *   `null` as error and `_.object(colls, results)` as value.
 */
const collCounts = (req, res, next, appSlug, colls) => (cb) => {
  const countOne = (coll, cb2) => {
    const schema = new _schema(`${appSlug}.${coll}`).init(req, res, next);
    // The error reported by `get` is not forwarded; only `doc` is passed on.
    schema.get({ qt: 'count' }, (err, doc) => {
      cb2(null, doc);
    });
  };

  // Any error from mapLimit is dropped as well; callers only see the counts.
  const collect = (err, results) => {
    cb(null, _.object(colls, results));
  };

  async.mapLimit(colls, 4, countOne, collect);
};
// GET /: runs fetList for every entry of `urls` (at most 5 at a time),
// replies with the collected results and then passes them to saveToMysql.
app.get('/', function (req, response) {
  async.mapLimit(urls, 5, function (url, callback) {
    // `id` is a counter that lives outside this handler; every URL gets the
    // next value.
    id++;
    fetList(url, callback, id);
  }, function (err, results) {
    if (err) {
      // When an item fails, `results` is incomplete: neither send nor store it.
      console.error(err);
      response.status(500).send('failed to fetch list');
      return;
    }
    response.send(results);
    saveToMysql(results);
  });
});
// Scraping JD is too simple, so a different approach is used here.
// async is an asynchronous module; it issues several requests at the same time.
// mapLimit(arr, limit, iterator, [callback])
async.mapLimit(targetUrls, 2, (url, callback) => {
  getHTML(url, callback);
}, (err, results) => {
  if (!err) {
    processResult(results);
    return;
  }
  console.log(err);
});
_getAndPutData(sourceEntry, destEntry, log, cb) { log.debug('replicating data', { entry: sourceEntry.getLogInfo() }); if (sourceEntry.getLocation().some(part => { const partObj = new ObjectMDLocation(part); return partObj.getDataStoreETag() === undefined; })) { const errMessage = 'cannot replicate object without dataStoreETag property'; log.error(errMessage, { method: 'ReplicateObject._getAndPutData', entry: sourceEntry.getLogInfo(), }); return cb(errors.InternalError.customizeDescription(errMessage)); } const locations = sourceEntry.getReducedLocations(); return async.mapLimit(locations, MPU_CONC_LIMIT, (part, done) => { this._getAndPutPart(sourceEntry, destEntry, part, log, done); }, cb); }
/**
 * Downloads the chapter index page at `url`, collects the links of the
 * entries whose position is below `chapters`, fetches them through fetUrl
 * (at most 5 at a time) and passes the collected results to saveToMysql.
 *
 * A failed index request, or a failed chapter fetch, is logged and nothing
 * is saved.
 *
 * @param {string} url - Address of the chapter index page.
 */
function main(url) {
  superagent.get(url)
    .charset('gbk') // The site is GBK-encoded; charset() comes from superagent-charset.
    .end(function (err, res) {
      console.log(url);
      if (err || !res) {
        // Without a response there is no page to parse.
        console.error(err);
        return;
      }
      // res.text is the fetched page; after cheerio.load() the API is jQuery-like.
      const $ = cheerio.load(res.text);
      const chapterNodes = $('#list dd');
      const urls = [];
      total = chapterNodes.length;
      console.log(`共${chapterNodes.length}章`);
      chapterNodes.each(function (i, v) {
        if (i < chapters) {
          urls.push('http://www.zwdu.com' + $(v).find('a').attr('href'));
        }
      });
      async.mapLimit(urls, 5, function (url, callback) {
        // Chapters have to be numbered, so the counter `id` is advanced per URL.
        id++;
        fetUrl(url, callback, id);
      }, function (err, results) {
        if (err) {
          // An error leaves `results` incomplete; do not store it.
          console.error(err);
          return;
        }
        saveToMysql(results);
      });
    });
}
// Runs getData for every entry of `requests` (two at a time) and writes the
// collected results to jobCount.txt as JSON.
async.mapLimit(requests, 2, function (request, callback) {
  getData(request, callback);
}, function (err, result) {
  if (err) {
    // An error leaves `result` incomplete; do not write it out.
    console.error(err);
    return;
  }
  // fs.writeFile requires a completion callback; it also reports write errors.
  fs.writeFile("jobCount.txt", JSON.stringify(result), function (writeErr) {
    if (writeErr) {
      console.error(writeErr);
    }
  });
});
/**
 * Walks the chapter list of every book in `list`, one book at a time.
 *
 * @param {Array} list - Items whose `_doc` is passed to getCurBookSectionList.
 * @returns {Promise<boolean>} Resolves with `true` when every item has been
 *   processed; rejects with `false` (after logging the error) otherwise.
 */
const mapBookList = (list) => new Promise((resolve, reject) => {
  const visitBook = (series, callback) => {
    const doc = series._doc;
    getCurBookSectionList(doc, callback);
  };

  const finish = (err, result) => {
    if (!err) {
      resolve(true);
      return;
    }
    logger.error('书籍目录抓取异步执行出错!');
    logger.error(err);
    reject(false);
  };

  async.mapLimit(list, 1, visitBook, finish);
});
/**
 * Runs the second round of requests: calls getJobInfo for every entry of
 * `requestsSecond` (two at a time), flattens the per-request job lists,
 * drops jobs whose `_id` was already seen and inserts the rest via
 * `mongodb.insert`.
 *
 * `finishedCnt` is reset to 0 before the requests start.
 */
function startSecondRequest() {
  console.log("requestsSecond = ", requestsSecond.length);
  finishedCnt = 0;
  async.mapLimit(requestsSecond, 2, function (params, callback) {
    getJobInfo(params, callback);
  }, function (err, result) {
    if (err) {
      // Report the failure; whatever was collected so far is still processed.
      console.error(err);
    }
    const jobs = [];
    // A Set of stringified ids: unlike a plain object used as a lookup table,
    // ids such as "constructor" or "toString" are not mistaken for duplicates.
    const seenIds = new Set();
    const repeatKeys = [];
    Object.keys(result || {}).forEach(function (i) {
      const group = result[i];
      Object.keys(group || {}).forEach(function (j) {
        const job = group[j];
        const key = String(job["_id"]);
        if (seenIds.has(key)) {
          repeatKeys.push(job["_id"]);
          return;
        }
        seenIds.add(key);
        jobs.push(job);
      });
    });
    console.log("重复岗位数量 :" + repeatKeys.length);
    console.log("未重复岗位数量 :" + jobs.length);
    mongodb.insert(jobs);
  });
}
_createAndPushEntry(objectMds, done) { if (objectMds.length > 0) { return async.mapLimit(objectMds, 10, (objectMd, cb) => { const objectMdEntry = this.createEntry.createPutEntry(objectMd, this._targetZenkoBucket); return cb(null, objectMdEntry); }, (err, entries) => { if (err) { this.log.error('error sending objectMd to kafka', { method: 'IngestionProducer._createAndPushEntry', error: err, }); } return done(err, entries); }); } return done(null, []); }
/**
 * Processes every entry of `pageUrls` with dealPageUrl, strictly one after
 * another, and then calls getImages. The outcome of the run (error or
 * results) is not inspected.
 */
function getImageUrls() {
  const handlePage = (url, callback) => {
    console.log("deal url : ", url);
    dealPageUrl(url, callback);
  };

  const proceed = (err, result) => {
    getImages();
  };

  async.mapLimit(pageUrls, 1, handlePage, proceed);
}
/**
 * Crawls every page URL in `pageUrlList`, at most 3 at a time.
 *
 * What is stored here is just a table-of-contents array spanning a few dozen
 * pages - a very simple data object that is crawled within minutes - so it
 * is saved as one unit. There is deliberately no lookup/de-duplication and
 * no per-record failure tracking (that would make a simple thing
 * complicated). If crawling fails, find the affected books in the log,
 * drop the collection manually and crawl again.
 *
 * @param {Array} pageUrlList - Page URLs, each passed to getCurPage.
 * @returns {Promise} `resolve` is handed to saveDB as its second argument,
 *   so the promise settles with whatever saveDB passes to it; on a crawl
 *   error the error is logged and the promise rejects with `false`.
 */
const getBookList = (pageUrlList) => new Promise((resolve, reject) => {
  const crawlPage = (series, callback) => {
    getCurPage(series, callback);
  };

  const onAllPages = (err, result) => {
    if (!err) {
      const booklist = getNewBookListArray(result);
      saveDB(booklist, resolve);
      return;
    }
    logger.error('书籍目录抓取异步执行出错!');
    logger.error(err);
    reject(false);
  };

  async.mapLimit(pageUrlList, 3, crawlPage, onAllPages);
});
/**
 * Processes every entry of `imageUrls` with dealImageUrl, strictly one
 * after another. Nothing is done with the final error or results.
 */
function getImages() {
  const handleImage = (url, callback) => {
    dealImageUrl(url, callback);
  };

  // The outcome is intentionally left unused.
  const ignoreOutcome = (err, result) => {};

  async.mapLimit(imageUrls, 1, handleImage, ignoreOutcome);
}
// Runs fetchUrl for every entry of `urls`, at most 5 at a time, and prints
// the collected results once the run is over. The error argument is not used.
async.mapLimit(urls, 5, (url, callback) => {
  fetchUrl(url, callback);
}, (err, result) => {
  console.log('final:');
  console.log(result);
});
// First round: runs getData for every entry of `requestsFirst`, two at a
// time, then kicks off the second round. The error and results of the first
// round are not inspected.
async.mapLimit(requestsFirst, 2, (request, callback) => {
  getData(request, callback);
}, (err, result) => {
  startSecondRequest();
});
// GET /: runs fetList for every entry of `urls` (at most 5 at a time),
// replies with the collected results and then passes them to saveToMysql.
app.get('/', function (req, response) {
  async.mapLimit(urls, 5, function (url, callback) {
    // `id` is a counter that lives outside this handler; every URL gets the
    // next value.
    id++;
    fetList(url, callback, id);
  }, function (err, results) {
    if (err) {
      // When an item fails, `results` is incomplete: neither send nor store it.
      console.error(err);
      response.status(500).send('failed to fetch list');
      return;
    }
    response.send(results);
    saveToMysql(results);
  });
});