7 Star 0 Fork 0

gb-dev / spiderPic

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
fetch-imgs.js 10.37 KB
一键复制 编辑 原始数据 按行查看 历史
huzorro 提交于 2017-03-14 19:05 . 抓取常规版面美女图片
var fetch = require('node-fetch')
var cheerio = require('cheerio')
var path = require('path')
var fs = require('fs')
var rd = require('rd');
const util = require('util')
var moment = require('moment')
var UpYun = require('upyun')
var Nightmare = require('nightmare');
var async = require("async")
// Module-level Nightmare browser instance, created with show: false.
// linksByNm reads links from this instance and ends it afterwards.
var nightmare = Nightmare({
show: false
// openDevTools: true
})
// SQLite settings: Sequelize connection stored in the local file ./spider-pic.sqlite3
var Sequelize = require('sequelize');
var sequelize = new Sequelize('sqlite:spider-pic.sqlite3', {
dialect: 'sqlite',
storage: './spider-pic.sqlite3'
})
/**
 * Collects protocol-relative image URLs ("//host/path") from an HTML document
 * and returns them as absolute "http://" URLs.
 *
 * Matched elements that lack `attr`, or whose value does not start with "//",
 * are skipped.
 *
 * @param {string} content - HTML markup to parse.
 * @param {string} selector - CSS selector matching the image elements.
 * @param {string} attr - Name of the attribute holding the image URL (e.g. "src").
 * @returns {string[]} The rewritten URLs, in the order the elements were visited.
 */
// Exported under its own name: the former `exports.fetchImgsByReplace` alias is
// overwritten by the downloader of the same name later in this file, so the
// parser was never reachable from outside the module.
var parseImgsByReplace = exports.parseImgsByReplace = function (content, selector, attr) {
    var $ = cheerio.load(content);
    var imgs = [];
    $(selector).each(function (index, img) {
        var value = $(img).attr(attr);
        // attr() yields undefined for a missing attribute; guard before startsWith.
        if (typeof value === 'string' && value.startsWith("//")) {
            // Only the leading "//" is replaced (String.replace with a string pattern
            // touches the first occurrence only).
            imgs.push(value.replace("//", "http://"));
        }
    });
    return imgs;
};
/**
 * Extracts the value of `attr` from every element matched by `selector`.
 *
 * Elements without the attribute (or with an empty value) are skipped, so the
 * result contains only non-empty strings and is safe to pass to string helpers
 * such as `startsWith`.
 *
 * @param {string} content - HTML markup to parse.
 * @param {string} selector - CSS selector matching the image elements.
 * @param {string} attr - Name of the attribute holding the image URL (e.g. "src").
 * @returns {string[]} Attribute values, in the order the elements were visited.
 */
var parseImgs = exports.parseImgs = function (content, selector, attr) {
    var $ = cheerio.load(content);
    var imgs = [];
    $(selector).each(function (index, img) {
        var value = $(img).attr(attr);
        // attr() yields undefined when the attribute is absent; never emit that.
        if (typeof value === 'string' && value !== '') {
            imgs.push(value);
        }
    });
    return imgs;
};
/**
 * Downloads a remote image and stores it in `directory`, named after the last
 * path segment of the URL.
 *
 * The returned promise settles only after the file has been completely
 * written, so `await saveImg(...)` really waits for the download. Previously it
 * resolved as soon as piping started and stream errors were left unhandled.
 *
 * @param {string} img - Absolute URL of the image.
 * @param {string} directory - Existing directory to write the file into.
 * @returns {Promise<void>} Resolves once the write stream has finished.
 * @throws Rejects with the fetch error, or with the error of the response or
 *   file stream.
 */
var saveImg = exports.saveImg = async function saveImg(img, directory) {
    var filePath = path.join(directory, path.basename(img));
    console.time(filePath);
    try {
        var res = await fetch(img);
        await new Promise(function (resolve, reject) {
            var writeStream = fs.createWriteStream(filePath);
            writeStream.on('finish', resolve);
            writeStream.on('error', reject);
            // pipe() does not forward source errors; close the file ourselves.
            res.body.on('error', function (err) {
                writeStream.destroy();
                reject(err);
            });
            res.body.pipe(writeStream);
        });
    } finally {
        // Always close the timer label opened above, even on failure.
        console.timeEnd(filePath);
    }
};
/**
 * Fetches a page, extracts its protocol-relative image URLs via
 * parseImgsByReplace and saves each image into `directory`.
 *
 * Downloads run one at a time, in the order returned by the parser.
 *
 * @param {string} url - Address of the page to scan.
 * @param {string} selector - CSS selector matching the image elements.
 * @param {string} attr - Name of the attribute holding the image URL.
 * @param {string} directory - Directory the images are saved into.
 * @returns {Promise<void>}
 */
var fetchImgsByReplace = exports.fetchImgsByReplace = async function fetchImgsByReplace(url, selector, attr, directory) {
    const response = await fetch(url);
    const html = await response.text();
    const imageUrls = parseImgsByReplace(html, selector, attr);
    for (let i = 0; i < imageUrls.length; i += 1) {
        const imageUrl = imageUrls[i];
        util.log("img:%s", imageUrl);
        await saveImg(imageUrl, directory);
    }
};
/**
 * Fetches a page, extracts image URLs via parseImgs and saves each image into
 * `directory`.
 *
 * Every extracted value is resolved against the page URL with the WHATWG URL
 * parser, so absolute ("http://", "https://"), protocol-relative ("//host/x"),
 * root-relative ("/x") and page-relative ("x") references all become a proper
 * absolute URL. The previous string concatenation turned "https://…" and
 * "//host/…" into "http://<page-host>/https://…".
 *
 * Missing or empty values, unparsable values and non-HTTP(S) URLs (e.g.
 * "data:") are logged and skipped. Downloads run one at a time.
 *
 * @param {string} url - Address of the page to scan; also the base for relative URLs.
 * @param {string} selector - CSS selector matching the image elements.
 * @param {string} attr - Name of the attribute holding the image URL.
 * @param {string} directory - Directory the images are saved into.
 * @returns {Promise<void>}
 */
var fetchImgs = exports.fetchImgs = async function fetchImgs(url, selector, attr, directory) {
    var res = await fetch(url);
    var content = await res.text();
    var imgs = parseImgs(content, selector, attr);
    for (let img of imgs) {
        util.log("img:%s", img);
        if (typeof img !== 'string' || img === '') {
            continue;
        }
        var resolved;
        try {
            resolved = new URL(img, url);
        } catch (err) {
            util.log("skip unresolvable img:%s (%s)", img, err.message);
            continue;
        }
        if (resolved.protocol !== 'http:' && resolved.protocol !== 'https:') {
            util.log("skip non-http img:%s", img);
            continue;
        }
        if (resolved.href !== img) {
            // Mirror the old behaviour of logging the rewritten address.
            util.log(resolved.href);
        }
        await saveImg(resolved.href, directory);
    }
};
/**
 * Fetches an entry page and returns the value of `attr` for every element
 * matched by `selector` (typically the href of each target-page link).
 *
 * Elements without the attribute contribute `undefined` to the result.
 *
 * @param {string} url - Address of the entry page.
 * @param {string} selector - CSS selector matching the link elements.
 * @param {string} attr - Name of the attribute to read from each element.
 * @returns {Promise<Array<string|undefined>>} Attribute values in visiting order.
 */
var fetchLinks = exports.fetchLinks = async function fetchLinks(url, selector, attr) {
    const page = await fetch(url);
    const markup = await page.text();
    const $ = cheerio.load(markup);
    const collected = [];
    $(selector).each((position, element) => {
        const value = $(element).attr(attr);
        collected.push(value);
    });
    return collected;
};
/**
 * Opens `url` in the module-level Nightmare browser, clicks the category
 * element and keeps scrolling to the bottom until the page height stops
 * growing, so that lazily loaded content is present in the DOM.
 *
 * This deliberately drives the shared module-level `nightmare` instance: it is
 * the instance linksByNm reads the links from. The previous code created a
 * second, local browser here, so the instance queried afterwards had never
 * navigated anywhere, and the local one was never ended.
 *
 * @param {string} url - Page to open.
 * @param {string} cateSelector - CSS selector of the category element to click.
 * @returns {Promise<void>} Resolves when two consecutive height readings match.
 */
var scrollByNm = exports.scrollByNm = async function scrollByNm(url, cateSelector) {
    await nightmare
        .viewport(1600, 900)
        .useragent("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36")
        .goto(url)
        .click(cateSelector)
        .wait(5000);
    // Scroll-triggered loading: repeat until the document height is stable.
    var previousHeight;
    var currentHeight = 0;
    while (previousHeight !== currentHeight) {
        previousHeight = currentHeight;
        currentHeight = await nightmare.evaluate(function () {
            return document.body.scrollHeight;
        });
        // Give the page time to append the next batch before measuring again.
        await nightmare.scrollTo(currentHeight, 0)
            .wait(3000);
    }
};
/**
 * Reads `attr` from every element matching `linkSelector` in the page
 * currently loaded in the module-level Nightmare browser, then ends that
 * browser instance.
 *
 * @param {string} linkSelector - CSS selector matching the link elements.
 * @param {string} attr - Name of the attribute to read (e.g. "href").
 * @returns {Promise<Array<{url: (string|null)}>>} One `{ url }` object per matched element.
 */
var linksByNm = exports.linksByNm = async function linksByNm(linkSelector, attr) {
    // Handed to nightmare.evaluate together with its arguments; it relies only
    // on its parameters and the page's `document`, not on outer variables.
    const collect = function (selector, attribute) {
        var nodes = document.querySelectorAll(selector);
        return Array.prototype.map.call(nodes, function (node) {
            return { url: node.getAttribute(attribute) };
        });
    };
    const found = await nightmare.evaluate(collect, linkSelector, attr);
    await nightmare.end();
    return found;
};
/**
 * Creates the directory `saveTo` unless something already exists at that path.
 *
 * The work is synchronous (fs.mkdirSync); the function is async only so that
 * callers can `await` it. Parent directories are not created.
 *
 * @param {string} saveTo - Path of the directory to create.
 * @returns {Promise<void>}
 * @throws Rejects with the fs error if the directory cannot be created for any
 *   reason other than it already existing.
 */
var mkdir = exports.mkdir = async function mkdir(saveTo) {
    if (fs.existsSync(saveTo)) {
        return;
    }
    try {
        fs.mkdirSync(saveTo);
    } catch (err) {
        // Something else may create the path between the check and the call;
        // that outcome is exactly what the caller asked for.
        if (err.code !== 'EEXIST') {
            throw err;
        }
    }
};
// Model describing the attributes recorded for each uploaded image.
// Each attribute maps to the column named by its `field`.
var album = sequelize.define('album', {
id: {
type: Sequelize.BIGINT,
field: 'id',
autoIncrement: true,
primaryKey: true // declared as the primary key
},
cateID: {
type: Sequelize.STRING,
field: 'cateid'
},
cateName: {
type: Sequelize.STRING,
field: 'cate_name'
},
albumID: {
type: Sequelize.STRING,
field: 'albumid'
},
albumName: {
type: Sequelize.STRING,
field: 'album_name'
},
upyun: {
type: Sequelize.STRING,
field: 'upyun'
},
width: {
type: Sequelize.STRING,
field: 'width'
},
height: {
type: Sequelize.STRING,
field: 'height'
}
}, {
// tableName:'tbl_author',
timestamps: true,
// Sequelize automatically adds createdAt and updatedAt attributes when the model is defined, so the change time of every row is known.
// If this feature is not wanted, set timestamps to false.
// timestamps: false,
// Do not delete records; add a deletedAt attribute instead. Only effective when timestamps is enabled.
paranoid: true,
// Do not use camelCase for the automatically added attributes; use underscore style instead.
// So updatedAt becomes updated_at.
underscored: true,
// Prevent the table name from being changed; by default Sequelize pluralizes every model name (the first argument of define).
// Set this to true if that is not wanted, so the model name is used as the table name.
// freezeTableName: true,
// Custom table name
//tableName: 'my_very_custom_table_name'
})
/**
 * Sums the UTF-16 code units of `name`. Used to derive the numeric category
 * and album IDs from their folder names.
 */
function charCodeSum(name) {
    var total = 0;
    for (var i = 0; i < name.length; i += 1) {
        total += name.charCodeAt(i);
    }
    return total;
}

/**
 * Extracts the category and album folder names from a path containing
 * "/imgs/<cate>/<album>/". Returns null when the path does not have that shape.
 */
function parseAlbumPath(file) {
    var match = /\/imgs\/(\w+)\/(\w+)\//.exec(file);
    if (!match) {
        return null;
    }
    return {
        cateID: String(charCodeSum(match[1])),
        cateName: match[1],
        albumID: String(charCodeSum(match[2])),
        albumName: match[2]
    };
}

/**
 * Uploads every .jpg/.gif/.jpeg file below `localRoot` to UpYun, records each
 * successful upload in the `album` table and deletes the local file.
 *
 * At most 10 files are processed concurrently (async.queue). Each file goes
 * through upload -> database insert -> local delete, and its queue slot is
 * released only when that chain has finished or failed. Previously a failed
 * upload or a non-200 response never released its slot, errors thrown inside
 * callbacks escaped every handler, and the function returned before any work
 * had completed.
 *
 * @param {string} bucket - UpYun bucket (service) name.
 * @param {string} username - UpYun operator name.
 * @param {string} passwd - UpYun operator password.
 * @param {string} secret - Secret passed to the UpYun client options.
 * @param {string} upyunRoot - Remote root path the files are stored under.
 * @param {string} serviceURI - Public base URI prepended to the remote path in the DB record.
 * @param {string} localRoot - Local directory scanned (recursively, via rd) for images.
 * @returns {Promise<{failed: number, success: number}>} Counters, resolved once
 *   every queued file has been handled. A file that was uploaded and recorded
 *   but could not be deleted locally counts in both `success` and `failed`.
 * @throws Rejects if the database sync or the directory scan fails.
 */
var upyunUploadAndRecord = exports.upyunUploadAndRecord = async function upyunUploadAndRecord(bucket, username, passwd, secret, upyunRoot, serviceURI, localRoot) {
    var upyun = new UpYun(bucket, username, passwd, 'v0.api.upyun.com', {
        apiVersion: 'v2',
        secret: secret
    });
    await sequelize.sync();
    util.log('synced');
    var files = rd.readFileSync(localRoot);
    var stime = new Date();
    var taskResult = { failed: 0, success: 0 };
    var imageExtensions = ['.jpg', '.gif', '.jpeg'];

    // Promise adapter around the callback-style upyun.putFile.
    function putFile(remotePath, localFile) {
        return new Promise(function (resolve, reject) {
            upyun.putFile(remotePath, localFile, null, true, null, function (err, response) {
                if (err) {
                    reject(err);
                } else {
                    resolve(response);
                }
            });
        });
    }

    // Upload one file, record it and remove the local copy. Never rejects for
    // expected failures; those are logged and counted in taskResult instead.
    async function processFile(f) {
        var startedAt = new Date();
        util.log('file: %s', f);
        var info = parseAlbumPath(f);
        if (!info) {
            util.log("skip %s: path does not contain /imgs/<cate>/<album>/", f);
            ++taskResult.failed;
            return;
        }
        util.log("cateID:%s, cateName:%s, albumID:%s, albumName:%s", info.cateID, info.cateName, info.albumID, info.albumName);

        // Remote name: <root>/<year>/<month>/<day>/<ms timestamp>-<random><ext>.
        // %d turns the zero-padded month/day strings into plain numbers.
        var now = moment();
        var x = util.format("%s/%d/%d/%d/%d-%s%s", upyunRoot, now.format("YYYY"), now.format("MM"), now.format("DD"),
            now.format("x"), Math.random().toString(36).substr(2, 8), path.extname(f));
        util.log("upyun:%s%s", serviceURI, x);

        var response;
        try {
            response = await putFile(x, f);
        } catch (err) {
            util.log("upyun putfile error:%s", err);
            ++taskResult.failed;
            return;
        }
        util.log(response);
        if (!response || Number(response.statusCode) !== 200) {
            util.log("upyun putfile unexpected status:%s", response && response.statusCode);
            ++taskResult.failed;
            return;
        }

        // Build the row as a plain object; formatting JSON text and parsing it
        // back breaks on quotes or backslashes in any value.
        var headers = response.headers || {};
        var width = headers['x-upyun-width'];
        var height = headers['x-upyun-height'];
        try {
            await album.create({
                cateID: info.cateID,
                cateName: info.cateName,
                albumID: info.albumID,
                albumName: info.albumName,
                upyun: serviceURI + x,
                width: width === undefined ? null : String(width),
                height: height === undefined ? null : String(height)
            });
        } catch (err) {
            util.log("album create failed %s", err);
            ++taskResult.failed;
            return;
        }
        util.log("album create success");
        ++taskResult.success;

        try {
            fs.unlinkSync(f);
            util.log("the process is time consuming %s ms", new Date() - startedAt);
            util.log("delete %s successed", f);
        } catch (error) {
            util.log("delete %s failed %s", f, error);
            ++taskResult.failed;
        }
    }

    // Concurrency-limited worker; the callback is invoked on every path so a
    // failing file can never block the queue.
    var q = async.queue(function (task, callback) {
        processFile(task.topicFile).then(function () {
            callback(null);
        }, function (err) {
            util.log("upyun file or insert db failed %s", err);
            ++taskResult.failed;
            callback(null);
        });
    }, 10);

    // Queue every image and wait for each per-task callback. This does not
    // depend on the queue's drain hook, and also completes when nothing is queued.
    var pending = files.filter(function (topicFile) {
        return imageExtensions.includes(path.extname(topicFile));
    }).map(function (topicFile) {
        util.log("another:%s", topicFile);
        return new Promise(function (resolve) {
            q.push({ topicFile: topicFile }, function (err) {
                util.log('push finished ' + topicFile);
                resolve();
            });
        });
    });
    await Promise.all(pending);

    util.log('all tasks have been processed');
    util.log('all tasks start time %s end time %s cost time %d ms', moment(stime).format(), moment().format(), (new Date() - stime));
    util.log(taskResult);
    return taskResult;
};
/**
 * Entry point written for the mm-eastday.com picture site: renders the entry
 * page, collects the album links and downloads each album's images into its
 * own randomly named sub-directory of `directory`.
 *
 * @param {string} url - Entry page to open.
 * @param {string} selector - CSS selector matching the images on an album page.
 * @param {string} folderName - Not used by this function.
 * @param {string} attr - Attribute read from each album link (e.g. "href").
 * @param {string} cateSelector - CSS selector of the category element to click.
 * @param {string} linkSelector - CSS selector matching the album links.
 * @param {string} imgAttr - Attribute holding the image URL on album pages.
 * @param {string} directory - Root directory for the downloaded albums.
 * @returns {Promise<void>}
 */
var my = exports.my = async function my(url, selector, folderName, attr, cateSelector, linkSelector, imgAttr, directory) {
    await mkdir(directory);
    await scrollByNm(url, cateSelector);
    util.log("Done");
    // Album list taken from the rendered entry page.
    const albums = await linksByNm(linkSelector, attr);
    for (let i = 0; i < albums.length; i += 1) {
        // Each album gets its own folder named by a random base-36 token.
        const token = Math.random().toString(36).substr(2, 8);
        const albumDir = path.join(directory, String(token));
        await mkdir(albumDir);
        await fetchImgs(albums[i].url, selector, imgAttr, albumDir);
    }
};
1
https://gitee.com/gb-dev/spiderPic.git
git@gitee.com:gb-dev/spiderPic.git
gb-dev
spiderPic
spiderPic
dev

搜索帮助