diff --git a/.travis.yml b/.travis.yml index a9f233f37f99ae2dcd5aa2cfefe18738158dd470..9e6f78d38cb18e09fab941605d511519d6fea323 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,3 +1,3 @@ language: java jdk: - - oraclejdk7 + - openjdk7 diff --git a/README-zh.md b/README-zh.md index e8f07355168882959769d0dd375807cbade48a9b..cd1b090c73dc42fa6f676cb5fdddb70df04487b2 100644 --- a/README-zh.md +++ b/README-zh.md @@ -1,4 +1,4 @@ -![logo](https://raw.github.com/code4craft/webmagic/master/assets/logo.jpg) +![logo](http://webmagic.io/images/logo.jpeg) [![Build Status](https://travis-ci.org/code4craft/webmagic.png?branch=master)](https://travis-ci.org/code4craft/webmagic) @@ -38,12 +38,12 @@ webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用w us.codecraft webmagic-core - 0.6.1 + 0.7.3 us.codecraft webmagic-extension - 0.6.1 + 0.7.3 ``` @@ -161,7 +161,7 @@ public class OschinaBlog { webmagic-samples目录里有一些定制PageProcessor以抽取不同站点的例子。 -webmagic的使用可以参考:[oschina openapi 应用:博客搬家](http://my.oschina.net/oscfox/blog/194507) +webmagic的使用可以参考:[oschina openapi 应用:博客搬家](https://git.oschina.net/yashin/MoveBlog) ### 协议 @@ -178,7 +178,7 @@ QQ: ### QQ群: -373225642 +373225642(已满) 542327088 ### 相关项目: diff --git a/README.md b/README.md index 8785844332fe22edaae2ee77c52d3e90bf2d7045..73cb48833bf10506414b63a31d24efff00626c46 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -![logo](https://raw.github.com/code4craft/webmagic/master/assets/logo.jpg) +![logo](http://webmagic.io/images/logo.jpeg) [Readme in Chinese](https://github.com/code4craft/webmagic/tree/master/README-zh.md) @@ -23,12 +23,12 @@ Add dependencies to your pom.xml: us.codecraft webmagic-core - 0.6.1 + 0.7.3 us.codecraft webmagic-extension - 0.6.1 + 0.7.3 ``` @@ -142,7 +142,7 @@ To write webmagic, I refered to the projects below : [http://list.qq.com/cgi-bin/qf_invite?id=023a01f505246785f77c5a5a9aff4e57ab20fcdde871e988](http://list.qq.com/cgi-bin/qf_invite?id=023a01f505246785f77c5a5a9aff4e57ab20fcdde871e988) -QQ Group: 373225642 +QQ Group: 373225642 542327088 ### Related Project diff --git a/assets/data.plist b/assets/data.plist deleted file mode 100644 index 5c8fa3adf557dfc18b96e88c61ebcd26243a3382..0000000000000000000000000000000000000000 --- a/assets/data.plist +++ /dev/null @@ -1,1525 +0,0 @@ - - - - - ApplicationVersion - - com.omnigroup.OmniGrafflePro - 139.16.0.171715 - - CreationDate - 2014-03-12 08:47:15 +0000 - Creator - 黄 亿华 - GraphDocumentVersion - 8 - GuidesLocked - NO - GuidesVisible - YES - ImageCounter - 2 - ImageLinkBack - - - - ImageList - - image1.pdf - - LinksVisible - NO - MagnetsVisible - NO - MasterSheets - - ModificationDate - 2014-03-12 12:19:49 +0000 - Modifier - 黄 亿华 - NotesVisible - NO - OriginVisible - NO - PageBreaks - YES - PrintInfo - - NSBottomMargin - - float - 41 - - NSHorizonalPagination - - coded - BAtzdHJlYW10eXBlZIHoA4QBQISEhAhOU051bWJlcgCEhAdOU1ZhbHVlAISECE5TT2JqZWN0AIWEASqEhAFxlwCG - - NSLeftMargin - - float - 18 - - NSPaperSize - - size - {595, 842} - - NSPrintReverseOrientation - - int - 0 - - NSRightMargin - - float - 18 - - NSTopMargin - - float - 18 - - - ReadOnly - NO - Sheets - - - ActiveLayerIndex - 0 - AutoAdjust - - BackgroundGraphic - - Bounds - {{0, 0}, {559, 783}} - Class - SolidGraphic - ID - 2 - Style - - shadow - - Draws - NO - - stroke - - Draws - NO - - - - BaseZoom - 0 - CanvasOrigin - {0, 0} - ColumnAlign - 1 - ColumnSpacing - 36 - DisplayScale - 1 0/72 in = 1.0000 in - GraphicsList - - - Class - LineGraphic - FontInfo - - Font - Helvetica - Size - 18 - - Head - - ID - 3 - - ID - 47 - Points - - {280.41377887789179, 462.5} - {280.41377887789179, 115.5} - - Style - - stroke - - HeadArrow - FilledArrow - Legacy - - TailArrow - 0 - - - Tail - - ID - 25 - - - - Bounds - {{146.91379269521701, 588}, {66, 22}} - Class - ShapedGraphic - FitText - YES - Flow - Resize - FontInfo - - Font - Helvetica - Size - 18 - - ID - 46 - Shape - Rectangle - Style - - fill - - Draws - NO - - shadow - - Draws - NO - - stroke - - Draws - NO - - - Text - - Pad - 0 - Text - {\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400 -\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} -{\colortbl;\red255\green255\blue255;} -\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc - -\f0\fs36 \cf0 Manage} - VerticalPad - 0 - - Wrap - NO - - - Bounds - {{146.41379269521701, 139}, {37, 22}} - Class - ShapedGraphic - FitText - YES - Flow - Resize - FontInfo - - Font - Helvetica - Size - 18 - - ID - 45 - Shape - Rectangle - Style - - fill - - Draws - NO - - shadow - - Draws - NO - - stroke - - Draws - NO - - - Text - - Pad - 0 - Text - {\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400 -\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} -{\colortbl;\red255\green255\blue255;} -\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc - -\f0\fs36 \cf0 URL} - VerticalPad - 0 - - Wrap - NO - - - Class - LineGraphic - FontInfo - - Font - Helvetica - Size - 18 - - Head - - ID - 16 - - ID - 44 - Points - - {372.15749563673154, 519.63519787392613} - {299.36323356646488, 641.57068447906465} - - Style - - stroke - - HeadArrow - 0 - Legacy - - TailArrow - FilledArrow - - - Tail - - ID - 19 - - - - Class - LineGraphic - FontInfo - - Font - Helvetica - Size - 18 - - Head - - ID - 16 - - ID - 43 - Points - - {278.95058213917224, 553.49998251301065} - {278.21479589361269, 641.50001748698935} - - Style - - stroke - - HeadArrow - 0 - Legacy - - TailArrow - FilledArrow - - - Tail - - ID - 18 - - - - Class - LineGraphic - FontInfo - - Font - Helvetica - Size - 18 - - Head - - ID - 16 - - ID - 42 - Points - - {183.67008975370248, 519.63519787392613} - {256.46435182396914, 641.57068447906465} - - Style - - stroke - - HeadArrow - 0 - Legacy - - TailArrow - FilledArrow - - - Tail - - ID - 17 - - - - Class - LineGraphic - FontInfo - - Font - Helvetica - Size - 18 - - Head - - ID - 3 - - ID - 38 - Points - - {444.39191518528105, 474.53820418060883} - {295.34133058251143, 115.4617958193911} - - Style - - stroke - - HeadArrow - FilledArrow - Legacy - - TailArrow - 0 - - - Tail - - ID - 28 - - - - Class - LineGraphic - FontInfo - - Font - Helvetica - Size - 18 - - Head - - ID - 3 - - ID - 37 - Points - - {436.53677827701807, 306.58797902807424} - {305.10330711341595, 115.41202097192573} - - Style - - stroke - - HeadArrow - FilledArrow - Legacy - - TailArrow - 0 - - - Tail - - ID - 26 - - - - Class - LineGraphic - FontInfo - - Font - Helvetica - Size - 18 - - Head - - ID - 3 - - ID - 34 - Points - - {112.13640445647307, 462.54223298101982} - {264.62814058011099, 115.45782532539049} - - Style - - stroke - - HeadArrow - FilledArrow - Legacy - - TailArrow - 0 - - - Tail - - ID - 22 - - - - Class - LineGraphic - FontInfo - - Font - Helvetica - Size - 18 - - Head - - ID - 3 - - ID - 33 - Points - - {119.85283790129353, 306.5917371670962} - {255.0255287391405, 115.40826283290379} - - Style - - stroke - - HeadArrow - FilledArrow - Legacy - - TailArrow - 0 - - - Tail - - ID - 20 - - - - Bounds - {{422.41379269521701, 411}, {60, 22}} - Class - ShapedGraphic - FitText - YES - Flow - Resize - FontInfo - - Font - Helvetica - Size - 18 - - ID - 32 - Shape - Rectangle - Style - - fill - - Draws - NO - - shadow - - Draws - NO - - stroke - - Draws - NO - - - Text - - Pad - 0 - Text - {\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400 -\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} -{\colortbl;\red255\green255\blue255;} -\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc - -\f0\fs36 \cf0 \'85\'85\'85.} - VerticalPad - 0 - - Wrap - NO - - - Bounds - {{247.91379269521701, 411}, {60, 22}} - Class - ShapedGraphic - FitText - YES - Flow - Resize - FontInfo - - Font - Helvetica - Size - 18 - - ID - 31 - Shape - Rectangle - Style - - fill - - Draws - NO - - shadow - - Draws - NO - - stroke - - Draws - NO - - - Text - - Pad - 0 - Text - {\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400 -\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} -{\colortbl;\red255\green255\blue255;} -\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc - -\f0\fs36 \cf0 \'85\'85\'85.} - VerticalPad - 0 - - Wrap - NO - - - Bounds - {{65.913792695217012, 411}, {60, 22}} - Class - ShapedGraphic - FitText - YES - Flow - Resize - FontInfo - - Font - Helvetica - Size - 18 - - ID - 30 - Shape - Rectangle - Style - - fill - - Draws - NO - - shadow - - Draws - NO - - stroke - - Draws - NO - - - Text - - Pad - 0 - Text - {\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400 -\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} -{\colortbl;\red255\green255\blue255;} -\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc - -\f0\fs36 \cf0 \'85\'85\'85.} - VerticalPad - 0 - - Wrap - NO - - - Bounds - {{392.41379269521701, 475}, {128, 57}} - Class - ShapedGraphic - FontInfo - - Font - Helvetica - Size - 18 - - ID - 28 - Shape - Rectangle - Text - - Text - {\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400 -\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} -{\colortbl;\red255\green255\blue255;} -\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc - -\f0\fs36 \cf0 Spider} - - - - Bounds - {{392.41379269521701, 307}, {128, 57}} - Class - ShapedGraphic - FontInfo - - Font - Helvetica - Size - 18 - - ID - 26 - Shape - Rectangle - Text - - Text - {\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400 -\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} -{\colortbl;\red255\green255\blue255;} -\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc - -\f0\fs36 \cf0 Spider} - - - - Bounds - {{216.41379269521701, 463}, {128, 57}} - Class - ShapedGraphic - FontInfo - - Font - Helvetica - Size - 18 - - ID - 25 - Shape - Rectangle - Text - - Text - {\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400 -\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} -{\colortbl;\red255\green255\blue255;} -\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc - -\f0\fs36 \cf0 Spider} - - - - Bounds - {{216.41379269521701, 307}, {128, 57}} - Class - ShapedGraphic - FontInfo - - Font - Helvetica - Size - 18 - - ID - 23 - Shape - Rectangle - Text - - Text - {\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400 -\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} -{\colortbl;\red255\green255\blue255;} -\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc - -\f0\fs36 \cf0 Spider} - - - - Bounds - {{35.413792695217012, 463}, {128, 57}} - Class - ShapedGraphic - FontInfo - - Font - Helvetica - Size - 18 - - ID - 22 - Shape - Rectangle - Text - - Text - {\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400 -\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} -{\colortbl;\red255\green255\blue255;} -\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc - -\f0\fs36 \cf0 Spider} - - - - Bounds - {{35.413792695217012, 307}, {128, 57}} - Class - ShapedGraphic - FontInfo - - Font - Helvetica - Size - 18 - - ID - 20 - Shape - Rectangle - Text - - Text - {\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400 -\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} -{\colortbl;\red255\green255\blue255;} -\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc - -\f0\fs36 \cf0 Spider} - - - - Bounds - {{202.41379269521701, 642}, {151, 71}} - Class - ShapedGraphic - FontInfo - - Font - Helvetica - Size - 18 - - ID - 16 - Shape - Rectangle - Text - - Text - {\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400 -\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} -{\colortbl;\red255\green255\blue255;} -\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc - -\f0\fs48 \cf0 Admin} - - - - Class - LineGraphic - FontInfo - - Font - Helvetica - Size - 18 - - Head - - ID - 3 - - ID - 15 - Points - - {428.35251803234684, 231.64153003420739} - {315.28169719234131, 115.35846996579261} - - Style - - stroke - - HeadArrow - FilledArrow - Legacy - - TailArrow - 0 - - - Tail - - ID - 8 - - - - Class - LineGraphic - FontInfo - - Font - Helvetica - Size - 18 - - Head - - ID - 3 - - ID - 13 - Points - - {128.26734609568192, 231.6464465995351} - {244.56023929475211, 115.35355340046489} - - Style - - stroke - - HeadArrow - FilledArrow - Legacy - - TailArrow - 0 - - - Tail - - ID - 7 - - - - Bounds - {{392.41379269521701, 232}, {128, 57}} - Class - ShapedGraphic - FontInfo - - Font - Helvetica - Size - 18 - - ID - 8 - Shape - Rectangle - Text - - Text - {\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400 -\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} -{\colortbl;\red255\green255\blue255;} -\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc - -\f0\fs36 \cf0 Spider} - - - - Bounds - {{35.413792695217012, 232}, {128, 57}} - Class - ShapedGraphic - FontInfo - - Font - Helvetica - Size - 18 - - ID - 7 - Shape - Rectangle - Text - - Text - {\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400 -\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} -{\colortbl;\red255\green255\blue255;} -\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc - -\f0\fs36 \cf0 Spider} - - - - Bounds - {{216.41379269521701, 232}, {128, 57}} - Class - ShapedGraphic - FontInfo - - Font - Helvetica - Size - 18 - - ID - 5 - Shape - Rectangle - Text - - Text - {\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400 -\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} -{\colortbl;\red255\green255\blue255;} -\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc - -\f0\fs36 \cf0 Spider} - - - - Bounds - {{200.41379269521701, 44}, {160, 71}} - Class - ShapedGraphic - FontInfo - - Font - Helvetica - Size - 18 - - ID - 3 - Shape - Rectangle - Text - - Text - {\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400 -\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} -{\colortbl;\red255\green255\blue255;} -\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc - -\f0\fs48 \cf0 Scheduler} - - - - Bounds - {{15.413792695217012, 204}, {168, 349}} - Class - ShapedGraphic - FontInfo - - Font - Helvetica - Size - 18 - - ID - 17 - Shape - Rectangle - Text - - Text - {\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400 -\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} -{\colortbl;\red255\green255\blue255;} -\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc - -\f0\fs48 \cf0 Worker} - - - - Bounds - {{372.41379269521701, 204}, {168, 349}} - Class - ShapedGraphic - FontInfo - - Font - Helvetica - Size - 18 - - ID - 19 - Shape - Rectangle - Text - - Text - {\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400 -\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} -{\colortbl;\red255\green255\blue255;} -\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc - -\f0\fs48 \cf0 Worker} - - - - Bounds - {{200.41379269521701, 204}, {160, 349}} - Class - ShapedGraphic - FontInfo - - Font - Helvetica - Size - 18 - - ID - 18 - Shape - Rectangle - Text - - Text - {\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400 -\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} -{\colortbl;\red255\green255\blue255;} -\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc - -\f0\fs48 \cf0 Worker} - - - - GridInfo - - HPages - 1 - KeepToScale - - Layers - - - Lock - NO - Name - 图层 1 - Print - YES - View - YES - - - LayoutInfo - - Animate - NO - circoMinDist - 18 - circoSeparation - 0.0 - layoutEngine - dot - neatoSeparation - 0.0 - twopiSeparation - 0.0 - - Orientation - 2 - PrintOnePage - - RowAlign - 1 - RowSpacing - 36 - SheetTitle - 版面 1 - UniqueID - 1 - VPages - 1 - - - ActiveLayerIndex - 0 - AutoAdjust - - BackgroundGraphic - - Bounds - {{0, 0}, {559, 783}} - Class - SolidGraphic - ID - 2 - Style - - shadow - - Draws - NO - - stroke - - Draws - NO - - - - BaseZoom - 0 - CanvasOrigin - {0, 0} - ColumnAlign - 1 - ColumnSpacing - 36 - DisplayScale - 1 0/72 in = 1.0000 in - GraphicsList - - - Bounds - {{278, 395}, {172, 104}} - Class - ShapedGraphic - ID - 52 - ImageID - 1 - Shape - Rectangle - Style - - fill - - Draws - NO - - shadow - - Draws - NO - - stroke - - Draws - NO - - - Text - - Text - {\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400 -\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} -{\colortbl;\red255\green255\blue255;} -\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc - -\f0\fs48 \cf0 Http API} - - - - Bounds - {{113, 395}, {172, 104}} - Class - ShapedGraphic - ID - 51 - ImageID - 1 - Shape - Rectangle - Style - - fill - - Draws - NO - - shadow - - Draws - NO - - stroke - - Draws - NO - - - Text - - Text - {\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400 -\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} -{\colortbl;\red255\green255\blue255;} -\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc - -\f0\fs48 \cf0 WebMagic} - - - - Bounds - {{278, 499}, {172, 104}} - Class - ShapedGraphic - ID - 50 - ImageID - 1 - Shape - Rectangle - Style - - fill - - Draws - NO - - shadow - - Draws - NO - - stroke - - Draws - NO - - - Text - - Text - {\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400 -\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} -{\colortbl;\red255\green255\blue255;} -\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc - -\f0\fs48 \cf0 Redis} - - - - Bounds - {{113, 499}, {172, 104}} - Class - ShapedGraphic - ID - 49 - ImageID - 1 - Shape - Rectangle - Style - - fill - - Draws - NO - - shadow - - Draws - NO - - stroke - - Draws - NO - - - Text - - Text - {\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400 -\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} -{\colortbl;\red255\green255\blue255;} -\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc - -\f0\fs48 \cf0 Mysql} - - - - GridInfo - - HPages - 1 - KeepToScale - - Layers - - - Lock - NO - Name - 图层 1 - Print - YES - View - YES - - - LayoutInfo - - Animate - NO - circoMinDist - 18 - circoSeparation - 0.0 - layoutEngine - dot - neatoSeparation - 0.0 - twopiSeparation - 0.0 - - Orientation - 2 - PrintOnePage - - RowAlign - 1 - RowSpacing - 36 - SheetTitle - 版面 2 - UniqueID - 2 - VPages - 1 - - - SmartAlignmentGuidesActive - YES - SmartDistanceGuidesActive - YES - UseEntirePage - - WindowInfo - - CurrentSheet - 0 - ExpandedCanvases - - Frame - {{373, 90}, {693, 788}} - ListView - - OutlineWidth - 142 - RightSidebar - - ShowRuler - - Sidebar - - SidebarWidth - 120 - VisibleRegion - {{-106, -56}, {769.65514710343643, 895.17238435507204}} - Zoom - 0.72500002384185791 - ZoomValues - - - 版面 1 - 0.72500002384185791 - 1.4500000476837158 - - - 版面 2 - 1 - 0.5 - - - - - diff --git a/assets/image1.pdf b/assets/image1.pdf deleted file mode 100644 index 79fff308c863194379c2b05fa26aecbaeca4a0f4..0000000000000000000000000000000000000000 Binary files a/assets/image1.pdf and /dev/null differ diff --git a/assets/logo-simple.jpg b/assets/logo-simple.jpg deleted file mode 100644 index 366aa6276185d8b1c946aae4c3e453fdc377e1b9..0000000000000000000000000000000000000000 Binary files a/assets/logo-simple.jpg and /dev/null differ diff --git a/assets/logo.graffle b/assets/logo.graffle deleted file mode 100644 index 84bbe20b50ccfb49748687b6245a825c9b9ce682..0000000000000000000000000000000000000000 --- a/assets/logo.graffle +++ /dev/null @@ -1,351 +0,0 @@ - - - - - ActiveLayerIndex - 0 - ApplicationVersion - - com.omnigroup.OmniGrafflePro - 139.16.0.171715 - - AutoAdjust - - BackgroundGraphic - - Bounds - {{0, 0}, {48, 48}} - Class - SolidGraphic - ID - 2 - Style - - shadow - - Draws - NO - - stroke - - Draws - NO - - - - BaseZoom - 0 - CanvasOrigin - {0, 0} - CanvasSize - {48, 48} - ColumnAlign - 1 - ColumnSpacing - 36 - CreationDate - 2013-11-10 06:17:01 +0000 - Creator - 黄 亿华 - DisplayScale - 1 pt = 1 pt - GraphDocumentVersion - 8 - GraphicsList - - - Bounds - {{7.5, 24}, {23, 15}} - Class - ShapedGraphic - FitText - YES - Flow - Resize - ID - 45 - Shape - Rectangle - Style - - fill - - Draws - NO - - shadow - - Draws - NO - - stroke - - Draws - NO - - - Text - - Pad - 0 - Text - {\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400 -\cocoascreenfonts1{\fonttbl\f0\fnil\fcharset0 Cochin;} -{\colortbl;\red255\green255\blue255;} -\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc - -\f0\fs16 \cf0 Magi -\fs24 c} - VerticalPad - 0 - - Wrap - NO - - - Bounds - {{18, 13}, {19.359630584716797, 18}} - Class - ShapedGraphic - FitText - Vertical - Flow - Resize - FontInfo - - Color - - w - 0 - - Font - STHeitiSC-Light - Size - 6 - - ID - 39 - Shape - Rectangle - Style - - fill - - Draws - NO - - shadow - - Draws - NO - - stroke - - Draws - NO - - - Text - - Text - {\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400 -\cocoascreenfonts1{\fonttbl\f0\fmodern\fcharset0 Courier-Oblique;} -{\colortbl;\red255\green255\blue255;} -\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc - -\f0\i\fs14 \cf0 eb} - - - - Class - LineGraphic - FontInfo - - Font - Helvetica - Size - 13 - - ID - 31 - Points - - {6, 11} - {15, 27} - {14, 8} - {21, 26} - {22, 6} - {22, 6} - - Style - - stroke - - HeadArrow - 0 - Legacy - - LineType - 1 - TailArrow - 0 - - - - - GridInfo - - GridSpacing - 1 - ShowsGrid - YES - SnapsToGrid - YES - - GuidesLocked - NO - GuidesVisible - YES - HPages - 1 - ImageCounter - 2 - KeepToScale - - Layers - - - Lock - NO - Name - 图层 1 - Print - YES - View - YES - - - LayoutInfo - - Animate - NO - circoMinDist - 18 - circoSeparation - 0.0 - layoutEngine - dot - neatoSeparation - 0.0 - twopiSeparation - 0.0 - - LinksVisible - NO - MagnetsVisible - NO - MasterSheets - - ModificationDate - 2013-11-10 06:51:47 +0000 - Modifier - 黄 亿华 - NotesVisible - NO - Orientation - 2 - OriginVisible - NO - PageBreaks - YES - PrintInfo - - NSBottomMargin - - float - 41 - - NSHorizonalPagination - - coded - BAtzdHJlYW10eXBlZIHoA4QBQISEhAhOU051bWJlcgCEhAdOU1ZhbHVlAISECE5TT2JqZWN0AIWEASqEhAFxlwCG - - NSLeftMargin - - float - 18 - - NSPaperSize - - size - {594.99997329711914, 842} - - NSPrintReverseOrientation - - int - 0 - - NSRightMargin - - float - 18 - - NSTopMargin - - float - 18 - - - PrintOnePage - - ReadOnly - NO - RowAlign - 1 - RowSpacing - 36 - SheetTitle - 版面 1 - SmartAlignmentGuidesActive - NO - SmartDistanceGuidesActive - NO - UniqueID - 1 - UseEntirePage - - VPages - 1 - WindowInfo - - CurrentSheet - 0 - ExpandedCanvases - - Frame - {{491, 381}, {498, 477}} - ListView - - OutlineWidth - 142 - RightSidebar - - Sidebar - - SidebarWidth - 116 - VisibleRegion - {{0.125, 0.125}, {47.75, 47.875}} - Zoom - 8 - ZoomValues - - - 版面 1 - 8 - 1 - - - - - diff --git a/assets/logo.jpg b/assets/logo.jpg deleted file mode 100644 index 356e25df0185c7461037b9dc15dc9d4a8566f476..0000000000000000000000000000000000000000 Binary files a/assets/logo.jpg and /dev/null differ diff --git a/assets/logo2.graffle/data.plist b/assets/logo2.graffle/data.plist deleted file mode 100644 index 54d64a42f36e601bc342f1916f6224715d4c6bc8..0000000000000000000000000000000000000000 --- a/assets/logo2.graffle/data.plist +++ /dev/null @@ -1,552 +0,0 @@ - - - - - ActiveLayerIndex - 0 - ApplicationVersion - - com.omnigroup.OmniGrafflePro - 139.16.0.171715 - - AutoAdjust - - BackgroundGraphic - - Bounds - {{0, 0}, {1117.9999465942383, 783}} - Class - SolidGraphic - FontInfo - - Font - Helvetica - Size - 37 - - ID - 2 - Style - - shadow - - Draws - NO - - stroke - - Draws - NO - - - - BaseZoom - 0 - CanvasOrigin - {0, 0} - ColumnAlign - 1 - ColumnSpacing - 36 - CreationDate - 2013-11-10 06:51:58 +0000 - Creator - 黄 亿华 - DisplayScale - 1 0/72 in = 1 0/72 in - GraphDocumentVersion - 8 - GraphicsList - - - Class - LineGraphic - FontInfo - - Font - Helvetica - Size - 13 - - Head - - ID - 60 - Position - 0.40939974784851074 - - ID - 62 - Points - - {324, 109} - {339.36559006029825, 179.11528294284673} - - Style - - stroke - - HeadArrow - 0 - Legacy - - LineType - 1 - TailArrow - 0 - Width - 10 - - - Tail - - ID - 59 - Info - 4 - - - - Class - LineGraphic - FontInfo - - Font - Helvetica - Size - 13 - - Head - - ID - 60 - Position - 0.73653632402420044 - - ID - 61 - Points - - {269, 146} - {296, 194} - {309, 266} - {349, 265} - {348.96211936963607, 215.03741157007715} - - Style - - stroke - - HeadArrow - 0 - Legacy - - LineType - 1 - TailArrow - 0 - Width - 10 - - - Tail - - ID - 59 - - - - Class - LineGraphic - FontInfo - - Font - Helvetica - Size - 13 - - ID - 60 - Points - - {371.89694213867188, 179} - {356.89694213867188, 162} - {335.89694213867188, 188} - {351.89694213867188, 217} - {371.89694213867188, 202} - - Style - - stroke - - HeadArrow - 0 - Legacy - - LineType - 1 - TailArrow - 0 - Width - 10 - - - - - Class - LineGraphic - FontInfo - - Font - Helvetica - Size - 13 - - ID - 59 - Points - - {269, 146} - {295, 189} - {300, 110} - {310, 178} - {324, 109} - - Style - - stroke - - HeadArrow - 0 - Legacy - - LineType - 1 - TailArrow - 0 - Width - 10 - - - - - Bounds - {{335.89695436197019, 119}, {41, 43}} - Class - ShapedGraphic - FitText - YES - Flow - Resize - ID - 47 - Shape - Rectangle - Style - - fill - - Draws - NO - - shadow - - Draws - NO - - stroke - - Draws - NO - - - Text - - Pad - 0 - Text - {\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400 -\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} -{\colortbl;\red255\green255\blue255;} -\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc - -\f0\fs72 \cf0 eb} - VerticalPad - 0 - - Wrap - NO - - - Bounds - {{164, 154}, {236.89692325714185, 98.181818181818088}} - Class - ShapedGraphic - ID - 45 - Shape - Rectangle - Style - - fill - - Draws - NO - - shadow - - Draws - NO - - stroke - - Draws - NO - - - Text - - Pad - 0 - Text - {\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400 -\cocoascreenfonts1{\fonttbl\f0\fnil\fcharset0 Cochin;} -{\colortbl;\red255\green255\blue255;} -\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc - -\f0\fs96 \cf0 Magi} - VerticalPad - 0 - - Wrap - NO - - - Class - LineGraphic - FontInfo - - Font - Helvetica - Size - 13 - - ID - 31 - Points - - {50.404270172119141, 72.000000000000256} - {115.40427017211914, 154.00000000000028} - {103.80320000069037, 26.090909090909292} - {124.95447158813477, 97} - {175.90226360069005, 143.90909090909116} - {186.20212982926148, 13} - {186.20212982926148, 13} - - Style - - stroke - - HeadArrow - 0 - Legacy - - LineType - 1 - TailArrow - 0 - Width - 10 - - - - - Bounds - {{406.79786682128906, 136.09091186523438}, {165, 160}} - Class - ShapedGraphic - ID - 46 - ImageID - 1 - Shape - Rectangle - Style - - fill - - Draws - NO - - shadow - - Draws - NO - - stroke - - Draws - NO - - - - - GridInfo - - GuidesLocked - NO - GuidesVisible - YES - HPages - 2 - ImageCounter - 2 - ImageLinkBack - - - - ImageList - - image1.tiff - - KeepToScale - - Layers - - - Lock - NO - Name - 图层 1 - Print - YES - View - YES - - - LayoutInfo - - Animate - NO - circoMinDist - 18 - circoSeparation - 0.0 - layoutEngine - dot - neatoSeparation - 0.0 - twopiSeparation - 0.0 - - LinksVisible - NO - MagnetsVisible - NO - MasterSheets - - ModificationDate - 2013-11-10 07:00:00 +0000 - Modifier - 黄 亿华 - NotesVisible - NO - Orientation - 2 - OriginVisible - NO - PageBreaks - YES - PrintInfo - - NSBottomMargin - - float - 41 - - NSHorizonalPagination - - coded - BAtzdHJlYW10eXBlZIHoA4QBQISEhAhOU051bWJlcgCEhAdOU1ZhbHVlAISECE5TT2JqZWN0AIWEASqEhAFxlwCG - - NSLeftMargin - - float - 18 - - NSPaperSize - - size - {594.99997329711914, 842} - - NSPrintReverseOrientation - - int - 0 - - NSRightMargin - - float - 18 - - NSTopMargin - - float - 18 - - - PrintOnePage - - ReadOnly - NO - RowAlign - 1 - RowSpacing - 36 - SheetTitle - 版面 1 - SmartAlignmentGuidesActive - YES - SmartDistanceGuidesActive - YES - UniqueID - 1 - UseEntirePage - - VPages - 1 - WindowInfo - - CurrentSheet - 0 - ExpandedCanvases - - Frame - {{350, -208}, {693, 795}} - ListView - - OutlineWidth - 142 - RightSidebar - - ShowRuler - - Sidebar - - SidebarWidth - 120 - VisibleRegion - {{23, 0}, {558, 656}} - Zoom - 1 - ZoomValues - - - 版面 1 - 1 - 1 - - - - - diff --git a/assets/logo2.graffle/image1.tiff b/assets/logo2.graffle/image1.tiff deleted file mode 100644 index 42bff86e55fec780c1b1eeec32b8c0e9f284ec2a..0000000000000000000000000000000000000000 Binary files a/assets/logo2.graffle/image1.tiff and /dev/null differ diff --git a/assets/logo3.graffle/data.plist b/assets/logo3.graffle/data.plist deleted file mode 100644 index 07fdd02cf9e43c8b9ac507df455dff1fa67be34b..0000000000000000000000000000000000000000 --- a/assets/logo3.graffle/data.plist +++ /dev/null @@ -1,840 +0,0 @@ - - - - - ApplicationVersion - - com.omnigroup.OmniGrafflePro - 139.16.0.171715 - - CreationDate - 2013-11-10 07:01:04 +0000 - Creator - 黄 亿华 - GraphDocumentVersion - 8 - GuidesLocked - NO - GuidesVisible - YES - ImageCounter - 6 - ImageLinkBack - - - - - - - ImageList - - image5.tiff - image4.tiff - image2.tiff - image1.tiff - - LinksVisible - NO - MagnetsVisible - NO - MasterSheets - - ModificationDate - 2013-11-10 08:09:16 +0000 - Modifier - 黄 亿华 - NotesVisible - NO - OriginVisible - NO - PageBreaks - YES - PrintInfo - - NSBottomMargin - - float - 41 - - NSHorizonalPagination - - coded - BAtzdHJlYW10eXBlZIHoA4QBQISEhAhOU051bWJlcgCEhAdOU1ZhbHVlAISECE5TT2JqZWN0AIWEASqEhAFxlwCG - - NSLeftMargin - - float - 18 - - NSPaperSize - - size - {594.99997329711914, 842} - - NSPrintReverseOrientation - - int - 0 - - NSRightMargin - - float - 18 - - NSTopMargin - - float - 18 - - - ReadOnly - NO - Sheets - - - ActiveLayerIndex - 0 - AutoAdjust - - BackgroundGraphic - - Bounds - {{0, 0}, {558.99997329711914, 783}} - Class - SolidGraphic - ID - 2 - Style - - shadow - - Draws - NO - - stroke - - Draws - NO - - - - BaseZoom - 0 - CanvasOrigin - {0, 0} - ColumnAlign - 1 - ColumnSpacing - 36 - DisplayScale - 1 0/72 in = 1 0/72 in - GraphicsList - - - Bounds - {{390, 391.5}, {114, 90}} - Class - ShapedGraphic - ID - 7 - ImageID - 2 - Shape - Rectangle - Style - - fill - - FillType - 2 - GradientAngle - 90 - GradientColor - - w - 0.666667 - - - shadow - - Draws - NO - - stroke - - Draws - NO - - - - - Bounds - {{3, 265}, {181, 114}} - Class - ShapedGraphic - FitText - YES - Flow - Resize - FontInfo - - Font - LucidaSans-DemiItalic - Size - 96 - - ID - 6 - Shape - Rectangle - Style - - fill - - Draws - NO - - shadow - - Draws - NO - - stroke - - Draws - NO - - - Text - - Pad - 0 - Text - {\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400 -\cocoascreenfonts1{\fonttbl\f0\fnil\fcharset0 LucidaSans-Demi;} -{\colortbl;\red255\green255\blue255;} -\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc - -\f0\i\b\fs192 \cf1 M } - VerticalPad - 0 - - Wrap - NO - - - Bounds - {{168, 314}, {77, 58}} - Class - ShapedGraphic - FitText - YES - Flow - Resize - FontInfo - - Font - LucidaSans-DemiItalic - Size - 48 - - ID - 5 - Shape - Rectangle - Style - - fill - - Draws - NO - - shadow - - Draws - NO - - stroke - - Draws - NO - - - Text - - Pad - 0 - Text - {\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400 -\cocoascreenfonts1{\fonttbl\f0\fnil\fcharset0 LucidaSans-Demi;} -{\colortbl;\red255\green255\blue255;\red255\green255\blue255;} -\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc - -\f0\i\b\fs96 \cf2 agi} - VerticalPad - 0 - - Wrap - NO - - - Bounds - {{356, 201}, {86, 86}} - Class - ShapedGraphic - FitText - YES - Flow - Resize - FontInfo - - Font - LucidaBright-DemiItalic - Size - 72 - - ID - 4 - Shape - Rectangle - Style - - fill - - Draws - NO - - shadow - - Draws - NO - - stroke - - Draws - NO - - - Text - - Pad - 0 - Text - {\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400 -\cocoascreenfonts1{\fonttbl\f0\fnil\fcharset0 LucidaBright-Demi;} -{\colortbl;\red255\green255\blue255;\red255\green255\blue255;} -\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc - -\f0\i\b\fs144 \cf2 eb} - VerticalPad - 0 - - Wrap - NO - - - Bounds - {{43, 114}, {395, 400}} - Class - ShapedGraphic - FitText - Clip - Flow - Clip - HFlip - YES - ID - 3 - ImageID - 1 - Shape - Rectangle - Style - - fill - - Draws - NO - - shadow - - Draws - NO - - stroke - - Draws - NO - - - - - Bounds - {{-4, 114}, {535, 400}} - Class - ShapedGraphic - ID - 1 - Shape - Rectangle - Style - - fill - - Color - - b - 0 - g - 0 - r - 0 - - - shadow - - Draws - NO - - - - - GridInfo - - HPages - 1 - KeepToScale - - Layers - - - Lock - NO - Name - 图层 1 - Print - YES - View - YES - - - LayoutInfo - - Animate - NO - circoMinDist - 18 - circoSeparation - 0.0 - layoutEngine - dot - neatoSeparation - 0.0 - twopiSeparation - 0.0 - - Orientation - 2 - PrintOnePage - - RowAlign - 1 - RowSpacing - 36 - SheetTitle - 版面 1 - UniqueID - 1 - VPages - 1 - - - ActiveLayerIndex - 0 - AutoAdjust - - BackgroundGraphic - - Bounds - {{0, 0}, {558.99997329711914, 783}} - Class - SolidGraphic - ID - 2 - Style - - shadow - - Draws - NO - - stroke - - Draws - NO - - - - BaseZoom - 0 - CanvasOrigin - {0, 0} - ColumnAlign - 1 - ColumnSpacing - 36 - DisplayScale - 1 0/72 in = 1.0000 in - GraphicsList - - - Bounds - {{232, 432}, {84, 93}} - Class - ShapedGraphic - ID - 10 - ImageID - 4 - Shape - Rectangle - Style - - fill - - Draws - NO - - shadow - - Draws - NO - - stroke - - Draws - NO - - - - - Bounds - {{16, 421}, {500, 115}} - Class - ShapedGraphic - FitText - YES - Flow - Resize - FontInfo - - Font - Helvetica-Bold - Size - 96 - - ID - 8 - Shape - Rectangle - Style - - fill - - Draws - NO - - stroke - - Draws - NO - - - Text - - Pad - 0 - Text - {\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400 -\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} -{\colortbl;\red255\green255\blue255;\red0\green0\blue0;} -\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc - -\f0\i\b\fs192 \cf2 Web agic} - VerticalPad - 0 - - Wrap - NO - - - GridInfo - - HPages - 1 - KeepToScale - - Layers - - - Lock - NO - Name - 图层 1 - Print - YES - View - YES - - - LayoutInfo - - Animate - NO - circoMinDist - 18 - circoSeparation - 0.0 - layoutEngine - dot - neatoSeparation - 0.0 - twopiSeparation - 0.0 - - Orientation - 2 - PrintOnePage - - RowAlign - 1 - RowSpacing - 36 - SheetTitle - 版面 2 - UniqueID - 2 - VPages - 1 - - - ActiveLayerIndex - 0 - AutoAdjust - - BackgroundGraphic - - Bounds - {{0, 0}, {1117.9999465942383, 783}} - Class - SolidGraphic - ID - 2 - Style - - shadow - - Draws - NO - - stroke - - Draws - NO - - - - BaseZoom - 0 - CanvasOrigin - {0, 0} - ColumnAlign - 1 - ColumnSpacing - 36 - DisplayScale - 1 0/72 in = 1.0000 in - GraphicsList - - - Bounds - {{9, 277.5}, {114, 114}} - Class - ShapedGraphic - ID - 11 - ImageID - 5 - Shape - Rectangle - Style - - fill - - Draws - NO - - shadow - - Draws - NO - - stroke - - Draws - NO - - - - - Bounds - {{100, 294}, {474, 115}} - Class - ShapedGraphic - FitText - YES - Flow - Resize - FontInfo - - Font - Helvetica-Bold - Size - 96 - - ID - 8 - Shape - Rectangle - Style - - fill - - Draws - NO - - stroke - - Draws - NO - - - Text - - Pad - 0 - Text - {\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400 -\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} -{\colortbl;\red255\green255\blue255;} -\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc - -\f0\i\b\fs192 \cf0 WebMagic} - VerticalPad - 0 - - Wrap - NO - - - GridInfo - - HPages - 2 - KeepToScale - - Layers - - - Lock - NO - Name - 图层 1 - Print - YES - View - YES - - - LayoutInfo - - Animate - NO - circoMinDist - 18 - circoSeparation - 0.0 - layoutEngine - dot - neatoSeparation - 0.0 - twopiSeparation - 0.0 - - Orientation - 2 - PrintOnePage - - RowAlign - 1 - RowSpacing - 36 - SheetTitle - 版面 3 - UniqueID - 3 - VPages - 1 - - - SmartAlignmentGuidesActive - YES - SmartDistanceGuidesActive - YES - UseEntirePage - - WindowInfo - - CurrentSheet - 2 - ExpandedCanvases - - Frame - {{174, 77}, {771, 795}} - ListView - - OutlineWidth - 142 - RightSidebar - - ShowRuler - - Sidebar - - SidebarWidth - 120 - VisibleRegion - {{0, 0}, {636, 656}} - Zoom - 1 - ZoomValues - - - 版面 1 - 1 - 1 - - - 版面 2 - 1 - 1 - - - 版面 3 - 1 - 1 - - - - - diff --git a/assets/logo3.graffle/image1.tiff b/assets/logo3.graffle/image1.tiff deleted file mode 100644 index 7d50474729e30e0fa30209b2b66cf5d5ee5ce7dc..0000000000000000000000000000000000000000 Binary files a/assets/logo3.graffle/image1.tiff and /dev/null differ diff --git a/assets/logo3.graffle/image2.tiff b/assets/logo3.graffle/image2.tiff deleted file mode 100644 index 606ae8dfcfa0e2eb843bad49f2d7c36832a0c3d9..0000000000000000000000000000000000000000 Binary files a/assets/logo3.graffle/image2.tiff and /dev/null differ diff --git a/assets/logo3.graffle/image4.tiff b/assets/logo3.graffle/image4.tiff deleted file mode 100644 index 0f674bf9628bf498431c5872703df59c7e17a6cf..0000000000000000000000000000000000000000 Binary files a/assets/logo3.graffle/image4.tiff and /dev/null differ diff --git a/assets/logo3.graffle/image5.tiff b/assets/logo3.graffle/image5.tiff deleted file mode 100644 index 2de8dfc47ed1ea521a6bba846569d592bd6a0a62..0000000000000000000000000000000000000000 Binary files a/assets/logo3.graffle/image5.tiff and /dev/null differ diff --git a/assets/logo3.png b/assets/logo3.png deleted file mode 100644 index bf4d7511b697a4748326c8841c4dff07b72e92c3..0000000000000000000000000000000000000000 Binary files a/assets/logo3.png and /dev/null differ diff --git a/assets/logo4.png b/assets/logo4.png deleted file mode 100644 index ba2337f7f93058d39a82b650ab94bf23fa0fc1e7..0000000000000000000000000000000000000000 Binary files a/assets/logo4.png and /dev/null differ diff --git a/assets/page-extract-rule.bmml b/assets/page-extract-rule.bmml deleted file mode 100644 index fec8d3ec84357157eec3ddd226636020b0a66852..0000000000000000000000000000000000000000 --- a/assets/page-extract-rule.bmml +++ /dev/null @@ -1,9 +0,0 @@ - - - - - A%20Web%20Page%0Ahttp%3A// - - - - \ No newline at end of file diff --git a/assets/webmagic-create-spider.bmml b/assets/webmagic-create-spider.bmml deleted file mode 100644 index 761704291613e3013950bfd57734ae1db2cd17c1..0000000000000000000000000000000000000000 --- a/assets/webmagic-create-spider.bmml +++ /dev/null @@ -1,440 +0,0 @@ - - - - - Create%20Spider%0Ahttp%3A//localhost%3A8080/spider/create - - - - - - - true - Custom%20PageProcessor%20 - - - - - - - true - BlogSpider - - - - - true - SpiderTemplate - - - - - true - New%20Template - - - - - true - Title - - - - - true - //title/text%28%29 - - - - - true - Content - - - - - true - //div%5B@class%3D%27BlogContent%27%5D/text%28%29 - - - - - true - Date - - - - - true - //div%5B@class%3D%27BlogStat%27%5D/regex%28%27%5Cd+-%5Cd+-%5Cd+%5Cs+%5Cd+%3A%5Cd+%27%29 - - - - - true - Tags - - - - - true - //div%5B@class%3D%27tags%27%5D/a/text%28%29 - - - - - - - - - - - Create%20Spider - - - - - Name - - - - - blog.oschina.net - - - - - StartUrls - - - - - http%3A//my.oschina.net/flashsword/blog/180623 - - - - - Other%20Source - - - - - up - Advanced%20Setting - - - - - - - - - URL%20manangement - - - - - Scheduler - - - - - Host - - - - - Redis - - - - - 127.0.0.1 - - - - - 6379 - - - - - - - true - New%20Scheduler - - - - - - - Persistent - - - - - Pipeline - - - - - Path - - - - - Local%20File - - - - - /data/webmaigc/%7BspdierName%7D - - - - - - - true - New%20Pipeline - - - - - Create - - - - - Cancel - - - - - - - - - Advanced%20Setting - - - - - - - Headers - - - - - true - User%20Agent - - - - - true - Cookie - - - - - true - Mozilla/5.0%20%28compatible%3B%20MSIE%2010.0... - - - - - true - id - - - - - Add - - - - - true - name - - - - - true - value - - - - - Add - - - - - Add - - - - - true - 123456 - - - - - - - - - true - Proxy - - - - - true - 127.0.0.1 - - - - - true - 8080 - - - - - true - username - - - - - true - password - - - - - - - - - true - Charset - - - - - true - utf-8 - - - - - AutoDetect - - - - - - - - - true - Frenquecny - - - - - true - 3000 - - - - - true - Sleep - - - - - true - milliseconds%20after%20download%20one%20page - - - - - - - - - - - Error%20Handle - - - - - - - true - Retry - - - - - true - 3 - - - - - true - Retry - - - - - true - times%20when%20downloading%20a%20page - - - - - true - If%20it%20still%20fails%20in%20downloading%2C%20re-insert%20it%20to%20url%20queue.%5Cr%5CrAfter%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20times%2C%20the%20url%20will%20be%20discarded. - - - - - true - 3 - - - - - - - - - - - Test - - - - - import - - - - - export - - - - \ No newline at end of file diff --git a/assets/webmagic-create-spider.png b/assets/webmagic-create-spider.png deleted file mode 100644 index 8fe92c47018e22114df68581510f92152612631c..0000000000000000000000000000000000000000 Binary files a/assets/webmagic-create-spider.png and /dev/null differ diff --git a/assets/webmagic-spider-manage.bmml b/assets/webmagic-spider-manage.bmml deleted file mode 100644 index 1423b01f8fb8f31a76d471abf670a706f1d2e7ad..0000000000000000000000000000000000000000 --- a/assets/webmagic-spider-manage.bmml +++ /dev/null @@ -1,110 +0,0 @@ - - - - - Spider%20List%20Page%0Ahttp%3A//localhost%3A8080/spider/list - - - - - Spider%20%2C%20Add%20Time%20%5Ev%2CPages%20Total%20%5Ev%2C%20Pages%20Downloaded%20%5Ev%2C%20Error%20%5Ev%2C%20%20Operation%0Agithub.com%2C%202014-3-1.12%3A20%3A10%2C1221%2C%20595%2C%204%2C%20Stop%20Edit%20Delete%0Aoschina.net%2C2014-2-12.16%3A10%3A20%2C120%2C%20%20120%2C%200%2C%20Start%20Edit%20Delete%0Aappstore.com%2C2014-2-10.9%3A20%3A10%2C100000%2C100000%2C%200%2CStart%20Edit%20Delete - - - - - Works - - - - - - - selected - 10.1.2.1 - - - - - 10.1.2.2 - - - - - selected - 10.1.2.3 - - - - - 10.1.2.4 - - - - - all - - - - - - - - - - Real%20Time - - - - - Pages - - - - - Time - - - - - - - Keyword - - - - - Search - - - - - Spiders - - - - - Charts - - - - - 2014-2-1 - - - - - 2014-3-1 - - - - - Time%20from - - - - - to - - - - \ No newline at end of file diff --git a/assets/webmagic-spider-manage.png b/assets/webmagic-spider-manage.png deleted file mode 100644 index 8fbdb6a9e8bd9b49f6498bb5e52a1dc1b643fa9a..0000000000000000000000000000000000000000 Binary files a/assets/webmagic-spider-manage.png and /dev/null differ diff --git a/assets/webmagic.psd b/assets/webmagic.psd deleted file mode 100644 index 5f8fd3b72ef04b07fe7eefacd51a0cd55a0da7f5..0000000000000000000000000000000000000000 Binary files a/assets/webmagic.psd and /dev/null differ diff --git a/pom.xml b/pom.xml index 4279ec71c08e4a92f56c9f1f5eb10358a156e45a..2b2384fd827a7b2de547ae81e95df0ba901db2e4 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 7 us.codecraft - 0.7.0-SNAPSHOT + 0.7.3 4.0.0 pom @@ -75,6 +75,11 @@ httpclient 4.5.2 + + org.apache.httpcomponents + httpcore + 4.4.4 + com.google.guava guava @@ -83,7 +88,7 @@ com.jayway.jsonpath json-path - 0.8.1 + 2.4.0 org.slf4j @@ -108,7 +113,7 @@ com.github.dreamhead moco-core - 0.9.1 + 0.11.0 test @@ -146,7 +151,7 @@ org.jsoup jsoup - 1.8.3 + 1.10.3 org.mockito @@ -164,8 +169,7 @@ maven-surefire-plugin 2.18 - pertest - -Xms1024m -Xmx1024m -Xss1m + 0 @@ -231,11 +235,20 @@ org.apache.maven.plugins maven-javadoc-plugin - 2.9.1 + 2.10.4 UTF-8 + WebMagic 0.7.3 + en_US + + aggregate + + aggregate + + site + attach-javadocs @@ -289,7 +302,7 @@ org.apache.maven.plugins maven-gpg-plugin - 1.5 + 1.6 verify diff --git a/release-note.md b/release-note.md deleted file mode 100755 index f44704efd075006a4fc3935fb6607b158f3815b4..0000000000000000000000000000000000000000 --- a/release-note.md +++ /dev/null @@ -1,91 +0,0 @@ -Release Notes ----- -See latest versions in [https://github.com/code4craft/webmagic/releases](https://github.com/code4craft/webmagic/releases) - -*2012-9-4* `version:0.3.0` - -* Change default XPath selector from HtmlCleaner to [Xsoup](https://github.com/code4craft/xsoup). - - [Xsoup](https://github.com/code4craft/xsoup) is an XPath selector based on Jsoup written by me. It has much better performance than HtmlCleaner. - - Time of processing a page is reduced from 7~9ms to 0.4ms. - - If Xsoup is not stable for your usage, just use `Spider.xsoupOff()` to turn off it and report an issue to me! - -* Add cycle retry times for Site. - - When cycle retry times is set, Spider will put the url which downloading failed back to scheduler, and retry after a cycle of queue. - -*2012-8-20* `version:0.2.1` - -ComboExtractor support for annotation. - -Request priority support (using `PriorityScheduler`). - -Complete some I18n work (comments and documents). - -More convenient extractor API: - -* Add attribute name select for CSSSelector. -* Group of regex selector can be specified. -* Add OrSelector. -* Add Selectors, import static Selectors.* for fluent API such as: - - or(regex("(.*)"), xpath("//title"), $("title")).select(s); -* Add JsonPathSelector for Json parse. - -*2012-8-9* `version:0.2.0` - -此次更新的主题是"方便"(之前的主题是"灵活")。 - -增加了webmagic-extension模块。 - -增加了注解方式支持,可以通过POJO+注解的方式编写一个爬虫,更符合Java开发习惯。以下是抓取一个博客的完整代码: - - @TargetUrl("http://my.oschina.net/flashsword/blog/\\d+") - public class OschinaBlog { - - @ExtractBy("//title") - private String title; - - @ExtractBy(value = "div.BlogContent",type = ExtractBy.Type.Css) - private String content; - - @ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true) - private List tags; - - public static void main(String[] args) { - OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"), - new ConsolePageModelPipeline(), OschinaBlog.class) - .scheduler(new RedisScheduler("127.0.0.1")).thread(5).run(); - } - - } - -增加一个Spider.test(url)方法,用于开发爬虫时进行调试。 - -增加基于redis的分布式支持。 - -增加XPath2.0语法支持(webmagic-saxon模块)。 - -增加基于Selenium的浏览器渲染支持,用于抓取动态加载内容(webmagic-selenium模块)。 - -修复了不支持https的bug。 - -补充了文档:[webmagic-0.2.0用户手册](http://code4craft.github.io/webmagic/)。 - -*2012-7-25* `version:0.1.0` - -第一个稳定版本。 - -修改了若干API,使得可扩展性更强,为每个任务分配一个ID,可以通过ID区分不同任务。 - -重写了Pipeline接口,将抽取结果集包装到ResultItems对象,而不是通用一个Page对象,便于逻辑分离。 - -增加下载的重试机制,支持gzip,支持自定义UA/cookie。 - -增加多线程抓取功能,只需在初始化的时候指定线程数即可。 - -增加jquery形式的CSS Selector API,可以通过`page.getHtml().$("div.body")`形式抽取元素。 - -完善了文档,架构说明:[webmagic的设计机制及原理-如何开发一个Java爬虫](http://my.oschina.net/flashsword/blog/145796),Javadoc:[http://code4craft.github.io/webmagic/docs](http://code4craft.github.io/webmagic/docs)。 \ No newline at end of file diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 7ca5c7b077b9fe98a26bd80a535fd35d88babd41..e889cd491b6daa97c94e08b7238e540c7a69cd02 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.7.0-SNAPSHOT + 0.7.3 4.0.0 diff --git a/webmagic-core/pom.xml.versionsBackup b/webmagic-core/pom.xml.versionsBackup deleted file mode 100644 index b530bab4198920a32bd87eb95282b3c76ddba7e9..0000000000000000000000000000000000000000 --- a/webmagic-core/pom.xml.versionsBackup +++ /dev/null @@ -1,86 +0,0 @@ - - - - us.codecraft - webmagic-parent - 0.5.2 - - 4.0.0 - - webmagic-core - - - - org.apache.httpcomponents - httpclient - - - - junit - junit - - - - com.google.guava - guava - - - - org.apache.commons - commons-lang3 - - - - us.codecraft - xsoup - - - - com.github.dreamhead - moco-core - - - - org.slf4j - slf4j-api - - - - org.slf4j - slf4j-log4j12 - - - - commons-collections - commons-collections - - - - org.assertj - assertj-core - - - - org.jsoup - jsoup - - - - org.apache.commons - commons-io - - - - com.jayway.jsonpath - json-path - 0.8.1 - - - - com.alibaba - fastjson - - - - - \ No newline at end of file diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index f9495a4a358fc751600baad0b4359a1d585e4bda..c11df693c75e14ce659595dcdad9e2bd65d9b160 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -4,9 +4,11 @@ import org.apache.commons.lang3.StringUtils; import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.selector.Json; import us.codecraft.webmagic.selector.Selectable; +import us.codecraft.webmagic.utils.HttpConstant; import us.codecraft.webmagic.utils.UrlUtils; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import java.util.Map; @@ -41,15 +43,25 @@ public class Page { private Map> headers; - private int statusCode; + private int statusCode = HttpConstant.StatusCode.CODE_200; - private boolean needCycleRetry; + private boolean downloadSuccess = true; + + private byte[] bytes; private List targetRequests = new ArrayList(); + private String charset; + public Page() { } + public static Page fail(){ + Page page = new Page(); + page.setDownloadSuccess(false); + return page; + } + public Page setSkip(boolean skip) { resultItems.setSkip(skip); return this; @@ -73,7 +85,7 @@ public class Page { */ public Html getHtml() { if (html == null) { - html = new Html(UrlUtils.fixAllRelativeHrefs(rawText, request.getUrl())); + html = new Html(rawText, request.getUrl()); } return html; } @@ -179,14 +191,6 @@ public class Page { return request; } - public boolean isNeedCycleRetry() { - return needCycleRetry; - } - - public void setNeedCycleRetry(boolean needCycleRetry) { - this.needCycleRetry = needCycleRetry; - } - public void setRequest(Request request) { this.request = request; this.resultItems.setRequest(request); @@ -221,17 +225,45 @@ public class Page { this.headers = headers; } + public boolean isDownloadSuccess() { + return downloadSuccess; + } + + public void setDownloadSuccess(boolean downloadSuccess) { + this.downloadSuccess = downloadSuccess; + } + + public byte[] getBytes() { + return bytes; + } + + public void setBytes(byte[] bytes) { + this.bytes = bytes; + } + + public String getCharset() { + return charset; + } + + public void setCharset(String charset) { + this.charset = charset; + } + @Override public String toString() { return "Page{" + "request=" + request + ", resultItems=" + resultItems + + ", html=" + html + + ", json=" + json + ", rawText='" + rawText + '\'' + ", url=" + url + ", headers=" + headers + ", statusCode=" + statusCode + - ", needCycleRetry=" + needCycleRetry + + ", downloadSuccess=" + downloadSuccess + ", targetRequests=" + targetRequests + + ", charset='" + charset + '\'' + + ", bytes=" + Arrays.toString(bytes) + '}'; } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java index 21cd72e6a8736416989e1d9ed3934e5b04fabe03..eefd91bb521fb15507856132a6897554da0f302a 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java @@ -1,5 +1,6 @@ package us.codecraft.webmagic; +import us.codecraft.webmagic.model.HttpRequestBody; import us.codecraft.webmagic.utils.Experimental; import java.io.Serializable; @@ -23,14 +24,19 @@ public class Request implements Serializable { private String method; + private HttpRequestBody requestBody; + /** * Store additional information in extras. */ private Map extras; + /** - * POST/GET param set - * */ - private Map params=new HashMap(); + * cookies for current url, if not set use Site's cookies + */ + private Map cookies = new HashMap(); + + private Map headers = new HashMap(); /** * Priority of the request.
@@ -39,6 +45,14 @@ public class Request implements Serializable { */ private long priority; + /** + * When it is set to TRUE, the downloader will not try to parse response body to text. + * + */ + private boolean binaryContent = false; + + private String charset; + public Request() { } @@ -87,12 +101,14 @@ public class Request implements Serializable { return extras; } - public void setExtras(Map extras) { + public Request setExtras(Map extras) { this.extras = extras; + return this; } - public void setUrl(String url) { + public Request setUrl(String url) { this.url = url; + return this; } /** @@ -105,31 +121,16 @@ public class Request implements Serializable { return method; } - public void setMethod(String method) { + public Request setMethod(String method) { this.method = method; + return this; } - public Map getParams() { - return params; - } - /** - * set params for request - *
- * DO NOT set this for request already has params, like 'https://github.com/search?q=webmagic' - * @param params params - * */ - public void setParams(Map params) { - this.params = params; - } - /** - * set params for request - *
- * DO NOT set this for request already has params, like 'https://github.com/search?q=webmagic' - * @param key key - * @param value value - * */ - public void putParams(String key,String value) { - params.put(key,value); + @Override + public int hashCode() { + int result = url != null ? url.hashCode() : 0; + result = 31 * result + (method != null ? method.hashCode() : 0); + return result; } @Override @@ -140,16 +141,51 @@ public class Request implements Serializable { Request request = (Request) o; if (url != null ? !url.equals(request.url) : request.url != null) return false; - if (method != null ? !method.equals(request.method) : request.method != null) return false; - return params != null ? params.equals(request.params) : request.params == null; + return method != null ? method.equals(request.method) : request.method == null; } - @Override - public int hashCode() { - int result = url != null ? url.hashCode() : 0; - result = 31 * result + (method != null ? method.hashCode() : 0); - result = 31 * result + (params != null ? params.hashCode() : 0); - return result; + public Request addCookie(String name, String value) { + cookies.put(name, value); + return this; + } + + public Request addHeader(String name, String value) { + headers.put(name, value); + return this; + } + + public Map getCookies() { + return cookies; + } + + public Map getHeaders() { + return headers; + } + + public HttpRequestBody getRequestBody() { + return requestBody; + } + + public void setRequestBody(HttpRequestBody requestBody) { + this.requestBody = requestBody; + } + + public boolean isBinaryContent() { + return binaryContent; + } + + public Request setBinaryContent(boolean binaryContent) { + this.binaryContent = binaryContent; + return this; + } + + public String getCharset() { + return charset; + } + + public Request setCharset(String charset) { + this.charset = charset; + return this; } @Override @@ -158,8 +194,10 @@ public class Request implements Serializable { "url='" + url + '\'' + ", method='" + method + '\'' + ", extras=" + extras + - ", params=" + params + ", priority=" + priority + + ", headers=" + headers + + ", cookies="+ cookies+ '}'; } + } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index 520902db600219cf7d8f39f47f2c7258d03a5a6d..b6963ca43c7e4774da6577d9c46c230703eb33d2 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -1,5 +1,7 @@ package us.codecraft.webmagic; +import us.codecraft.webmagic.utils.HttpConstant; + import java.util.*; /** @@ -39,8 +41,10 @@ public class Site { private boolean useGzip = true; + private boolean disableCookieManagement = false; + static { - DEFAULT_STATUS_CODE_SET.add(200); + DEFAULT_STATUS_CODE_SET.add(HttpConstant.StatusCode.CODE_200); } /** @@ -236,7 +240,7 @@ public class Site { * Put an Http header for downloader.
* Use {@link #addCookie(String, String)} for cookie and {@link #setUserAgent(String)} for user-agent.
* - * @param key key of http header, there are some keys constant in {@link HeaderConst} + * @param key key of http header, there are some keys constant in {@link HttpConstant.Header} * @param value value of header * @return this */ @@ -307,6 +311,22 @@ public class Site { return this; } + public boolean isDisableCookieManagement() { + return disableCookieManagement; + } + + /** + * Downloader is supposed to store response cookie. + * Disable it to ignore all cookie fields and stay clean. + * Warning: Set cookie will still NOT work if disableCookieManagement is true. + * @param disableCookieManagement disableCookieManagement + * @return this + */ + public Site setDisableCookieManagement(boolean disableCookieManagement) { + this.disableCookieManagement = disableCookieManagement; + return this; + } + public Task toTask() { return new Task() { @Override diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 5e785af0fef3b9e0e9da24686a0dc2fc1534e3f9..62c989f1d3479eea3ac636ba05acd0576fc21dad 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -1,6 +1,7 @@ package us.codecraft.webmagic; import org.apache.commons.collections.CollectionUtils; +import org.apache.commons.lang3.SerializationUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import us.codecraft.webmagic.downloader.Downloader; @@ -302,7 +303,7 @@ public class Spider implements Runnable, Task { public void run() { checkRunningStat(); initComponent(); - logger.info("Spider " + getUUID() + " started!"); + logger.info("Spider {} started!",getUUID()); while (!Thread.currentThread().isInterrupted() && stat.get() == STAT_RUNNING) { final Request request = scheduler.poll(this); if (request == null) { @@ -334,6 +335,7 @@ public class Spider implements Runnable, Task { if (destroyWhenExit) { close(); } + logger.info("Spider {} closed! {} pages downloaded.", getUUID(), pageCount.get()); } protected void onError(Request request) { @@ -398,34 +400,59 @@ public class Spider implements Runnable, Task { } } - protected void processRequest(Request request) { + private void processRequest(Request request) { Page page = downloader.download(request, this); - if (page == null) { - sleep(site.getSleepTime()); - onError(request); - return; - } - // for cycle retry - if (page.isNeedCycleRetry()) { - extractAndAddRequests(page, true); - sleep(site.getRetrySleepTime()); - return; - } - pageProcessor.process(page); - extractAndAddRequests(page, spawnUrl); - if (!page.getResultItems().isSkip()) { - for (Pipeline pipeline : pipelines) { - pipeline.process(page.getResultItems(), this); + if (page.isDownloadSuccess()){ + onDownloadSuccess(request, page); + } else { + onDownloaderFail(request); + } + } + + private void onDownloadSuccess(Request request, Page page) { + if (site.getAcceptStatCode().contains(page.getStatusCode())){ + pageProcessor.process(page); + extractAndAddRequests(page, spawnUrl); + if (!page.getResultItems().isSkip()) { + for (Pipeline pipeline : pipelines) { + pipeline.process(page.getResultItems(), this); + } } + } else { + logger.info("page status code error, page {} , code: {}", request.getUrl(), page.getStatusCode()); } sleep(site.getSleepTime()); + return; + } + + private void onDownloaderFail(Request request) { + if (site.getCycleRetryTimes() == 0) { + sleep(site.getSleepTime()); + } else { + // for cycle retry + doCycleRetry(request); + } + } + + private void doCycleRetry(Request request) { + Object cycleTriedTimesObject = request.getExtra(Request.CYCLE_TRIED_TIMES); + if (cycleTriedTimesObject == null) { + addRequest(SerializationUtils.clone(request).setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, 1)); + } else { + int cycleTriedTimes = (Integer) cycleTriedTimesObject; + cycleTriedTimes++; + if (cycleTriedTimes < site.getCycleRetryTimes()) { + addRequest(SerializationUtils.clone(request).setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, cycleTriedTimes)); + } + } + sleep(site.getRetrySleepTime()); } protected void sleep(int time) { try { Thread.sleep(time); } catch (InterruptedException e) { - e.printStackTrace(); + logger.error("Thread interrupted when sleep",e); } } @@ -474,6 +501,7 @@ public class Spider implements Runnable, Task { * Download urls synchronizing. * * @param urls urls + * @param type of process result * @return list downloaded */ public List getAll(Collection urls) { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java index c835dc8b0040e04bf5dd37a4d1eedcc3f54b12b3..c27292d09d8571b3a9ba5d3503c422987a55942c 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java @@ -41,20 +41,4 @@ public abstract class AbstractDownloader implements Downloader { protected void onError(Request request) { } - protected Page addToCycleRetry(Request request, Site site) { - Page page = new Page(); - Object cycleTriedTimesObject = request.getExtra(Request.CYCLE_TRIED_TIMES); - if (cycleTriedTimesObject == null) { - page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, 1)); - } else { - int cycleTriedTimes = (Integer) cycleTriedTimesObject; - cycleTriedTimes++; - if (cycleTriedTimes >= site.getCycleRetryTimes()) { - return null; - } - page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, cycleTriedTimes)); - } - page.setNeedCycleRetry(true); - return page; - } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index e6523ec8096005d31cca9571d102ef7b1ec8f665..24889c88b22b51b236b31f10667c74bff913aaff 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -2,16 +2,8 @@ package us.codecraft.webmagic.downloader; import org.apache.commons.io.IOUtils; import org.apache.http.HttpResponse; -import org.apache.http.annotation.ThreadSafe; -import org.apache.http.auth.AuthState; -import org.apache.http.auth.UsernamePasswordCredentials; import org.apache.http.client.methods.CloseableHttpResponse; -import org.apache.http.client.methods.HttpUriRequest; -import org.apache.http.client.protocol.HttpClientContext; -import org.apache.http.impl.auth.BasicScheme; import org.apache.http.impl.client.CloseableHttpClient; -import org.apache.http.protocol.BasicHttpContext; -import org.apache.http.protocol.HttpContext; import org.apache.http.util.EntityUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -37,7 +29,6 @@ import java.util.Map; * @author code4crafter@gmail.com
* @since 0.1.0 */ -@ThreadSafe public class HttpClientDownloader extends AbstractDownloader { private Logger logger = LoggerFactory.getLogger(getClass()); @@ -83,43 +74,29 @@ public class HttpClientDownloader extends AbstractDownloader { if (task == null || task.getSite() == null) { throw new NullPointerException("task or site can not be null"); } - logger.debug("downloading page {}", request.getUrl()); CloseableHttpResponse httpResponse = null; - int statusCode = 0; - Site site = task.getSite(); - Proxy proxy = null; - HttpContext httpContext = new BasicHttpContext(); - if (proxyProvider != null) { - proxy = proxyProvider.getProxy(task); - AuthState authState = new AuthState(); - authState.update(new BasicScheme(), new UsernamePasswordCredentials(proxy.getUsername(), proxy.getPassword())); - httpContext.setAttribute(HttpClientContext.PROXY_AUTH_STATE, authState); - } - CloseableHttpClient httpClient = getHttpClient(site); - HttpUriRequest httpUriRequest = httpUriRequestConverter.convert(request, site, proxy); + CloseableHttpClient httpClient = getHttpClient(task.getSite()); + Proxy proxy = proxyProvider != null ? proxyProvider.getProxy(task) : null; + HttpClientRequestContext requestContext = httpUriRequestConverter.convert(request, task.getSite(), proxy); + Page page = Page.fail(); try { - httpResponse = httpClient.execute(httpUriRequest, httpContext); - statusCode = httpResponse.getStatusLine().getStatusCode(); - if (site.getAcceptStatCode().contains(statusCode)) { - Page page = handleResponse(request, site.getCharset(), httpResponse, task); - onSuccess(request); - return page; - } else { - logger.warn("get page {} error, status code {} ",request.getUrl(),statusCode); - return null; - } + httpResponse = httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext()); + page = handleResponse(request, request.getCharset() != null ? request.getCharset() : task.getSite().getCharset(), httpResponse, task); + onSuccess(request); + logger.info("downloading page success {}", request.getUrl()); + return page; } catch (IOException e) { logger.warn("download page {} error", request.getUrl(), e); - if (site != null && site.getCycleRetryTimes() > 0) { - return addToCycleRetry(request, site); - } onError(request); - return null; + return page; } finally { if (httpResponse != null) { //ensure the connection is released back to pool EntityUtils.consumeQuietly(httpResponse.getEntity()); } + if (proxyProvider != null && proxy != null) { + proxyProvider.returnProxy(proxy, page, task); + } } } @@ -129,34 +106,33 @@ public class HttpClientDownloader extends AbstractDownloader { } protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException { - String content = getContent(charset, httpResponse); + byte[] bytes = IOUtils.toByteArray(httpResponse.getEntity().getContent()); + String contentType = httpResponse.getEntity().getContentType() == null ? "" : httpResponse.getEntity().getContentType().getValue(); Page page = new Page(); - page.setRawText(content); + page.setBytes(bytes); + if (!request.isBinaryContent()){ + if (charset == null) { + charset = getHtmlCharset(contentType, bytes); + } + page.setCharset(charset); + page.setRawText(new String(bytes, charset)); + } page.setUrl(new PlainText(request.getUrl())); page.setRequest(request); page.setStatusCode(httpResponse.getStatusLine().getStatusCode()); + page.setDownloadSuccess(true); if (responseHeader) { page.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders())); } return page; } - private String getContent(String charset, HttpResponse httpResponse) throws IOException { + private String getHtmlCharset(String contentType, byte[] contentBytes) throws IOException { + String charset = CharsetUtils.detectCharset(contentType, contentBytes); if (charset == null) { - byte[] contentBytes = IOUtils.toByteArray(httpResponse.getEntity().getContent()); - String htmlCharset = getHtmlCharset(httpResponse, contentBytes); - if (htmlCharset != null) { - return new String(contentBytes, htmlCharset); - } else { - logger.warn("Charset autodetect failed, use {} as charset. Please specify charset in Site.setCharset()", Charset.defaultCharset()); - return new String(contentBytes); - } - } else { - return IOUtils.toString(httpResponse.getEntity().getContent(), charset); + charset = Charset.defaultCharset().name(); + logger.warn("Charset autodetect failed, use {} as charset. Please specify charset in Site.setCharset()", Charset.defaultCharset()); } - } - - private String getHtmlCharset(HttpResponse httpResponse, byte[] contentBytes) throws IOException { - return CharsetUtils.detectCharset(httpResponse.getEntity().getContentType().getValue(), contentBytes); + return charset; } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java index 9e17f607594163594d18be0705d68530dcb3afb4..28a16f41d5eaf8101a9ec463b8d86938e305da12 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java @@ -9,6 +9,7 @@ import org.apache.http.config.RegistryBuilder; import org.apache.http.config.SocketConfig; import org.apache.http.conn.socket.ConnectionSocketFactory; import org.apache.http.conn.socket.PlainConnectionSocketFactory; +import org.apache.http.conn.ssl.DefaultHostnameVerifier; import org.apache.http.conn.ssl.SSLConnectionSocketFactory; import org.apache.http.impl.client.*; import org.apache.http.impl.conn.PoolingHttpClientConnectionManager; @@ -49,7 +50,9 @@ public class HttpClientGenerator { private SSLConnectionSocketFactory buildSSLConnectionSocketFactory() { try { - return new SSLConnectionSocketFactory(createIgnoreVerifySSL()); // 优先绕过安全证书 + return new SSLConnectionSocketFactory(createIgnoreVerifySSL(), new String[]{"SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2"}, + null, + new DefaultHostnameVerifier()); // 优先绕过安全证书 } catch (KeyManagementException e) { logger.error("ssl connection fail", e); } catch (NoSuchAlgorithmException e) { @@ -127,6 +130,10 @@ public class HttpClientGenerator { } private void generateCookie(HttpClientBuilder httpClientBuilder, Site site) { + if (site.isDisableCookieManagement()) { + httpClientBuilder.disableCookieManagement(); + return; + } CookieStore cookieStore = new BasicCookieStore(); for (Map.Entry cookieEntry : site.getCookies().entrySet()) { BasicClientCookie cookie = new BasicClientCookie(cookieEntry.getKey(), cookieEntry.getValue()); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientRequestContext.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientRequestContext.java new file mode 100644 index 0000000000000000000000000000000000000000..74e6d25efadb34b1b4a63392eaf8949b83d896ef --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientRequestContext.java @@ -0,0 +1,34 @@ +package us.codecraft.webmagic.downloader; + +import org.apache.http.client.methods.HttpUriRequest; +import org.apache.http.client.protocol.HttpClientContext; + +/** + * @author code4crafter@gmail.com + * Date: 17/4/8 + * Time: 19:43 + * @since 0.7.0 + */ +public class HttpClientRequestContext { + + private HttpUriRequest httpUriRequest; + + private HttpClientContext httpClientContext; + + public HttpUriRequest getHttpUriRequest() { + return httpUriRequest; + } + + public void setHttpUriRequest(HttpUriRequest httpUriRequest) { + this.httpUriRequest = httpUriRequest; + } + + public HttpClientContext getHttpClientContext() { + return httpClientContext; + } + + public void setHttpClientContext(HttpClientContext httpClientContext) { + this.httpClientContext = httpClientContext; + } + +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java index db131d07e83c9d9a46e60f0f653dcb915b264f5a..28a7ce5ea22c9b8827a8c77a3dc0438963c1a612 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java @@ -1,33 +1,64 @@ package us.codecraft.webmagic.downloader; import org.apache.http.HttpHost; -import org.apache.http.NameValuePair; +import org.apache.http.auth.AuthState; +import org.apache.http.auth.ChallengeState; +import org.apache.http.auth.UsernamePasswordCredentials; +import org.apache.http.client.CookieStore; import org.apache.http.client.config.CookieSpecs; import org.apache.http.client.config.RequestConfig; -import org.apache.http.client.entity.UrlEncodedFormEntity; import org.apache.http.client.methods.HttpUriRequest; import org.apache.http.client.methods.RequestBuilder; -import org.apache.http.message.BasicNameValuePair; +import org.apache.http.client.protocol.HttpClientContext; +import org.apache.http.entity.ByteArrayEntity; +import org.apache.http.impl.auth.BasicScheme; +import org.apache.http.impl.client.BasicCookieStore; +import org.apache.http.impl.cookie.BasicClientCookie; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.proxy.Proxy; import us.codecraft.webmagic.utils.HttpConstant; +import us.codecraft.webmagic.utils.UrlUtils; -import java.nio.charset.Charset; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; import java.util.Map; /** * @author code4crafter@gmail.com * Date: 17/3/18 - * Time: 上午11:28 + * Time: 11:28 + * + * @since 0.7.0 */ public class HttpUriRequestConverter { - public HttpUriRequest convert(Request request, Site site, Proxy proxy) { - RequestBuilder requestBuilder = selectRequestMethod(request).setUri(request.getUrl()); + public HttpClientRequestContext convert(Request request, Site site, Proxy proxy) { + HttpClientRequestContext httpClientRequestContext = new HttpClientRequestContext(); + httpClientRequestContext.setHttpUriRequest(convertHttpUriRequest(request, site, proxy)); + httpClientRequestContext.setHttpClientContext(convertHttpClientContext(request, site, proxy)); + return httpClientRequestContext; + } + + private HttpClientContext convertHttpClientContext(Request request, Site site, Proxy proxy) { + HttpClientContext httpContext = new HttpClientContext(); + if (proxy != null && proxy.getUsername() != null) { + AuthState authState = new AuthState(); + authState.update(new BasicScheme(ChallengeState.PROXY), new UsernamePasswordCredentials(proxy.getUsername(), proxy.getPassword())); + httpContext.setAttribute(HttpClientContext.PROXY_AUTH_STATE, authState); + } + if (request.getCookies() != null && !request.getCookies().isEmpty()) { + CookieStore cookieStore = new BasicCookieStore(); + for (Map.Entry cookieEntry : request.getCookies().entrySet()) { + BasicClientCookie cookie1 = new BasicClientCookie(cookieEntry.getKey(), cookieEntry.getValue()); + cookie1.setDomain(UrlUtils.removePort(UrlUtils.getDomain(request.getUrl()))); + cookieStore.addCookie(cookie1); + } + httpContext.setCookieStore(cookieStore); + } + return httpContext; + } + + private HttpUriRequest convertHttpUriRequest(Request request, Site site, Proxy proxy) { + RequestBuilder requestBuilder = selectRequestMethod(request).setUri(UrlUtils.fixIllegalCharacterInUrl(request.getUrl())); if (site.getHeaders() != null) { for (Map.Entry headerEntry : site.getHeaders().entrySet()) { requestBuilder.addHeader(headerEntry.getKey(), headerEntry.getValue()); @@ -46,47 +77,39 @@ public class HttpUriRequestConverter { requestConfigBuilder.setProxy(new HttpHost(proxy.getHost(), proxy.getPort())); } requestBuilder.setConfig(requestConfigBuilder.build()); - return requestBuilder.build(); + HttpUriRequest httpUriRequest = requestBuilder.build(); + if (request.getHeaders() != null && !request.getHeaders().isEmpty()) { + for (Map.Entry header : request.getHeaders().entrySet()) { + httpUriRequest.addHeader(header.getKey(), header.getValue()); + } + } + return httpUriRequest; } private RequestBuilder selectRequestMethod(Request request) { String method = request.getMethod(); if (method == null || method.equalsIgnoreCase(HttpConstant.Method.GET)) { //default get - return addQueryParams(RequestBuilder.get(),request.getParams()); + return RequestBuilder.get(); } else if (method.equalsIgnoreCase(HttpConstant.Method.POST)) { - return addFormParams(RequestBuilder.post(), (NameValuePair[]) request.getExtra("nameValuePair"), request.getParams()); + return addFormParams(RequestBuilder.post(),request); } else if (method.equalsIgnoreCase(HttpConstant.Method.HEAD)) { - return addQueryParams(RequestBuilder.head(),request.getParams()); + return RequestBuilder.head(); } else if (method.equalsIgnoreCase(HttpConstant.Method.PUT)) { - return addFormParams(RequestBuilder.put(), (NameValuePair[]) request.getExtra("nameValuePair"), request.getParams()); + return addFormParams(RequestBuilder.put(), request); } else if (method.equalsIgnoreCase(HttpConstant.Method.DELETE)) { - return addQueryParams(RequestBuilder.delete(),request.getParams()); + return RequestBuilder.delete(); } else if (method.equalsIgnoreCase(HttpConstant.Method.TRACE)) { - return addQueryParams(RequestBuilder.trace(),request.getParams()); + return RequestBuilder.trace(); } throw new IllegalArgumentException("Illegal HTTP Method " + method); } - private RequestBuilder addFormParams(RequestBuilder requestBuilder, NameValuePair[] nameValuePair, Map params) { - List allNameValuePair=new ArrayList(); - if (nameValuePair != null && nameValuePair.length > 0) { - allNameValuePair= Arrays.asList(nameValuePair); - } - if (params != null) { - for (String key : params.keySet()) { - allNameValuePair.add(new BasicNameValuePair(key, params.get(key))); - } - } - requestBuilder.setEntity(new UrlEncodedFormEntity(allNameValuePair, Charset.forName("utf8"))); - return requestBuilder; - } - - private RequestBuilder addQueryParams(RequestBuilder requestBuilder, Map params) { - if (params != null) { - for (Map.Entry entry : params.entrySet()) { - requestBuilder.addParameter(entry.getKey(), entry.getValue()); - } + private RequestBuilder addFormParams(RequestBuilder requestBuilder, Request request) { + if (request.getRequestBody() != null) { + ByteArrayEntity entity = new ByteArrayEntity(request.getRequestBody().getBody()); + entity.setContentType(request.getRequestBody().getContentType()); + requestBuilder.setEntity(entity); } return requestBuilder; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java new file mode 100644 index 0000000000000000000000000000000000000000..7d3b307852ce142d7cac3d905f90dc7912f317e5 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java @@ -0,0 +1,102 @@ +package us.codecraft.webmagic.model; + +import org.apache.http.NameValuePair; +import org.apache.http.client.utils.URLEncodedUtils; +import org.apache.http.message.BasicNameValuePair; + +import java.io.Serializable; +import java.io.UnsupportedEncodingException; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +/** + * @author code4crafter@gmail.com + * Date: 17/4/8 + */ +public class HttpRequestBody implements Serializable { + + private static final long serialVersionUID = 5659170945717023595L; + + public static abstract class ContentType { + + public static final String JSON = "application/json"; + + public static final String XML = "text/xml"; + + public static final String FORM = "application/x-www-form-urlencoded"; + + public static final String MULTIPART = "multipart/form-data"; + } + + private byte[] body; + + private String contentType; + + private String encoding; + + public HttpRequestBody() { + } + + public HttpRequestBody(byte[] body, String contentType, String encoding) { + this.body = body; + this.contentType = contentType; + this.encoding = encoding; + } + + public String getContentType() { + return contentType; + } + + public String getEncoding() { + return encoding; + } + + public void setBody(byte[] body) { + this.body = body; + } + + public void setContentType(String contentType) { + this.contentType = contentType; + } + + public void setEncoding(String encoding) { + this.encoding = encoding; + } + + public static HttpRequestBody json(String json, String encoding) { + try { + return new HttpRequestBody(json.getBytes(encoding), ContentType.JSON, encoding); + } catch (UnsupportedEncodingException e) { + throw new IllegalArgumentException("illegal encoding " + encoding, e); + } + } + + public static HttpRequestBody xml(String xml, String encoding) { + try { + return new HttpRequestBody(xml.getBytes(encoding), ContentType.XML, encoding); + } catch (UnsupportedEncodingException e) { + throw new IllegalArgumentException("illegal encoding " + encoding, e); + } + } + + public static HttpRequestBody custom(byte[] body, String contentType, String encoding) { + return new HttpRequestBody(body, contentType, encoding); + } + + public static HttpRequestBody form(Map params, String encoding){ + List nameValuePairs = new ArrayList(params.size()); + for (Map.Entry entry : params.entrySet()) { + nameValuePairs.add(new BasicNameValuePair(entry.getKey(), String.valueOf(entry.getValue()))); + } + try { + return new HttpRequestBody(URLEncodedUtils.format(nameValuePairs, encoding).getBytes(encoding), ContentType.FORM, encoding); + } catch (UnsupportedEncodingException e) { + throw new IllegalArgumentException("illegal encoding " + encoding, e); + } + } + + public byte[] getBody() { + return body; + } +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java index 57d6eea3f86097c2824000bda3a139a025a532a4..be9fd7cc2e35948aa49a397b2c1e4f30202d2b34 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java @@ -1,10 +1,8 @@ package us.codecraft.webmagic.pipeline; import org.apache.commons.codec.digest.DigestUtils; -import org.apache.http.annotation.ThreadSafe; import org.slf4j.Logger; import org.slf4j.LoggerFactory; - import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.utils.FilePersistentBase; @@ -21,7 +19,6 @@ import java.util.Map; * @author code4crafter@gmail.com
* @since 0.1.0 */ -@ThreadSafe public class FilePipeline extends FilePersistentBase implements Pipeline { private Logger logger = LoggerFactory.getLogger(getClass()); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/ZhihuPageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/ZhihuPageProcessor.java index a2a17e8c210683c48307041bdffe4a674f786738..4c94eef198115dd2dff876e8a81799af537aac19 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/ZhihuPageProcessor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/ZhihuPageProcessor.java @@ -16,9 +16,9 @@ public class ZhihuPageProcessor implements PageProcessor { @Override public void process(Page page) { page.addTargetRequests(page.getHtml().links().regex("https://www\\.zhihu\\.com/question/\\d+/answer/\\d+.*").all()); - page.putField("title", page.getHtml().xpath("//h2[@class='zm-item-title']/a/text()").toString()); - page.putField("question", page.getHtml().xpath("//div[@id='zh-question-detail']//tidyText()").toString()); - page.putField("answer", page.getHtml().xpath("//div[@id='zh-question-answer-wrap']//div[@class='zm-editable-content']/tidyText()").toString()); + page.putField("title", page.getHtml().xpath("//h1[@class='QuestionHeader-title']/text()").toString()); + page.putField("question", page.getHtml().xpath("//div[@class='QuestionRichText']//tidyText()").toString()); + page.putField("answer", page.getHtml().xpath("//div[@class='QuestionAnswer-content']/tidyText()").toString()); if (page.getResultItems().get("title")==null){ //skip this page page.setSkip(true); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java index a38ccaa7e4fac484e6de2ecde724f7bb17f5adff..c5f100732c03346e7f39490e4aa4b33b2926be42 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java @@ -38,4 +38,36 @@ public class Proxy { public String getPassword() { return password; } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + Proxy proxy = (Proxy) o; + + if (port != proxy.port) return false; + if (host != null ? !host.equals(proxy.host) : proxy.host != null) return false; + if (username != null ? !username.equals(proxy.username) : proxy.username != null) return false; + return password != null ? password.equals(proxy.password) : proxy.password == null; + } + + @Override + public int hashCode() { + int result = host != null ? host.hashCode() : 0; + result = 31 * result + port; + result = 31 * result + (username != null ? username.hashCode() : 0); + result = 31 * result + (password != null ? password.hashCode() : 0); + return result; + } + + @Override + public String toString() { + return "Proxy{" + + "host='" + host + '\'' + + ", port=" + port + + ", username='" + username + '\'' + + ", password='" + password + '\'' + + '}'; + } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java index 4266d78c94652a916c02f8c4bab3194fc154d872..5b61a993ac0b533ed0cc3d51f6d0fa2a6f349726 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java @@ -1,14 +1,29 @@ package us.codecraft.webmagic.proxy; +import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Task; /** - * Created by edwardsbean on 15-2-28. + * Proxy provider.
+ * + * @since 0.7.0 */ public interface ProxyProvider { - void returnProxy(Proxy proxy, boolean banned, Task task); + /** + * + * Return proxy to Provider when complete a download. + * @param proxy the proxy config contains host,port and identify info + * @param page the download result + * @param task the download task + */ + void returnProxy(Proxy proxy, Page page, Task task); + /** + * Get a proxy for task by some strategy. + * @param task the download task + * @return proxy + */ Proxy getProxy(Task task); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ResponseChecker.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ResponseChecker.java deleted file mode 100644 index 3e68c11687c6acc35d85fbf2bb7bfc6fdb18bdcc..0000000000000000000000000000000000000000 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ResponseChecker.java +++ /dev/null @@ -1,13 +0,0 @@ -package us.codecraft.webmagic.proxy; - -import org.apache.http.HttpResponse; - -/** - * @author code4crafter@gmail.com - * Date: 17/3/20 - * Time: 下午10:52 - */ -public interface ResponseChecker { - - boolean isBanned(HttpResponse httpResponse); -} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java new file mode 100644 index 0000000000000000000000000000000000000000..d8f47fe44bd506c1482a18580fc64ed2051c212c --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java @@ -0,0 +1,62 @@ +package us.codecraft.webmagic.proxy; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Task; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.concurrent.atomic.AtomicInteger; + +/** + * A simple ProxyProvider. Provide proxy as round-robin without heartbeat and error check. It can be used when all proxies are stable. + * @author code4crafter@gmail.com + * Date: 17/4/16 + * Time: 10:18 + * @since 0.7.0 + */ +public class SimpleProxyProvider implements ProxyProvider { + + private final List proxies; + + private final AtomicInteger pointer; + + public SimpleProxyProvider(List proxies) { + this(proxies, new AtomicInteger(-1)); + } + + private SimpleProxyProvider(List proxies, AtomicInteger pointer) { + this.proxies = proxies; + this.pointer = pointer; + } + + public static SimpleProxyProvider from(Proxy... proxies) { + List proxiesTemp = new ArrayList(proxies.length); + for (Proxy proxy : proxies) { + proxiesTemp.add(proxy); + } + return new SimpleProxyProvider(Collections.unmodifiableList(proxiesTemp)); + } + + @Override + public void returnProxy(Proxy proxy, Page page, Task task) { + //Donothing + } + + @Override + public Proxy getProxy(Task task) { + return proxies.get(incrForLoop()); + } + + private int incrForLoop() { + int p = pointer.incrementAndGet(); + int size = proxies.size(); + if (p < size) { + return p; + } + while (!pointer.compareAndSet(p, p % size)) { + p = pointer.get(); + } + return p % size; + } +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxy.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxy.java deleted file mode 100644 index 7002df47b9eacb6c8cd5df3224a1632b76d0aea9..0000000000000000000000000000000000000000 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxy.java +++ /dev/null @@ -1,159 +0,0 @@ -package us.codecraft.webmagic.proxy; - -import java.io.Serializable; -import java.util.ArrayList; -import java.util.List; -import java.util.concurrent.Delayed; -import java.util.concurrent.TimeUnit; - -/** - * >>>> Proxy lifecycle - - +----------+ +-----+ - | last use | | new | - +-----+----+ +---+-+ - | +------+ | - +->| init |<--+ - +--+---+ - | - v - +--------+ - +--->| borrow | - | +---+----+ - | |+------------------+ - | v - | +--------+ - | | in use | Respone Time - | +---+----+ - | |+------------------+ - | v - | +--------+ - | | return | - | +---+----+ - | |+-------------------+ - | v - | +-------+ reuse interval - | | delay | (delay time) - | +---+---+ - | |+-------------------+ - | v - | +------+ - | | idle | idle time - | +---+--+ - | |+-------------------+ - +--------+ - */ - -/** - * Object has these status of lifecycle above.
- * - * @author yxssfxwzy@sina.com
- * @since 0.5.1 - * @see TimerReuseProxyPool - */ - -public class TimerReuseProxy extends Proxy implements Delayed, Serializable { - - private static final long serialVersionUID = 228939737383625551L; - public static final int ERROR_403 = 403; - public static final int ERROR_404 = 404; - public static final int ERROR_BANNED = 10000;// banned by website - public static final int ERROR_Proxy = 10001;// the proxy itself failed - public static final int SUCCESS = 200; - - private int reuseTimeInterval = 1500;// ms - private Long canReuseTime = 0L; - private Long lastBorrowTime = System.currentTimeMillis(); - private Long responseTime = 0L; - - private int failedNum = 0; - private int successNum = 0; - private int borrowNum = 0; - - private List failedErrorType = new ArrayList(); - - public TimerReuseProxy(String host, int port, String username, String password) { - super(host, port, username, password); - } - - - public int getSuccessNum() { - return successNum; - } - - public void successNumIncrement(int increment) { - this.successNum += increment; - } - - public Long getLastUseTime() { - return lastBorrowTime; - } - - public void setLastBorrowTime(Long lastBorrowTime) { - this.lastBorrowTime = lastBorrowTime; - } - - public void recordResponse() { - this.responseTime = (System.currentTimeMillis() - lastBorrowTime + responseTime) / 2; - this.lastBorrowTime = System.currentTimeMillis(); - } - - public List getFailedErrorType() { - return failedErrorType; - } - - public void setFailedErrorType(List failedErrorType) { - this.failedErrorType = failedErrorType; - } - - public void fail(int failedErrorType) { - this.failedNum++; - this.failedErrorType.add(failedErrorType); - } - - public void setFailedNum(int failedNum) { - this.failedNum = failedNum; - } - - public int getFailedNum() { - return failedNum; - } - - public String getFailedType() { - String re = ""; - for (Integer i : this.failedErrorType) { - re += i + " . "; - } - return re; - } - - public int getReuseTimeInterval() { - return reuseTimeInterval; - } - - public void setReuseTimeInterval(int reuseTimeInterval) { - this.reuseTimeInterval = reuseTimeInterval; - this.canReuseTime = System.nanoTime() + TimeUnit.NANOSECONDS.convert(reuseTimeInterval, TimeUnit.MILLISECONDS); - - } - - @Override - public long getDelay(TimeUnit unit) { - return unit.convert(canReuseTime - System.nanoTime(), TimeUnit.NANOSECONDS); - } - - @Override - public int compareTo(Delayed o) { - TimerReuseProxy that = (TimerReuseProxy) o; - return canReuseTime > that.canReuseTime ? 1 : (canReuseTime < that.canReuseTime ? -1 : 0); - - } - - public void borrowNumIncrement(int increment) { - this.borrowNum += increment; - } - - public int getBorrowNum() { - return borrowNum; - } -} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxyPool.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxyPool.java deleted file mode 100644 index 6dbac5d58d0611a00d3b0d4861834d235bd21661..0000000000000000000000000000000000000000 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxyPool.java +++ /dev/null @@ -1,204 +0,0 @@ -package us.codecraft.webmagic.proxy; - -import us.codecraft.webmagic.Task; - -/** - * Pooled Proxy Object - * - * @author yxssfxwzy@sina.com
- * @see Proxy - * @since 0.5.1 - */ -public class TimerReuseProxyPool implements ProxyProvider { - @Override - public void returnProxy(Proxy proxy, boolean banned, Task task) { - - } - - @Override - public Proxy getProxy(Task task) { - return null; - } - -// private Logger logger = LoggerFactory.getLogger(getClass()); -// -// private BlockingQueue proxyQueue = new DelayQueue(); -// private Map allProxy = new ConcurrentHashMap(); -// -// private int reuseInterval = 1500;// ms -// private int reviveTime = 2 * 60 * 60 * 1000;// ms -// private int saveProxyInterval = 10 * 60 * 1000;// ms -// -// private boolean isEnable = false; -// private boolean validateWhenInit = false; -// // private boolean isUseLastProxy = true; -// -// public TimerReuseProxyPool(List httpProxyList) { -// this(httpProxyList, true); -// } -// -// private void addProxy(Map httpProxyMap) { -// isEnable = true; -// for (Entry entry : httpProxyMap.entrySet()) { -// try { -// if (allProxy.containsKey(entry.getKey())) { -// continue; -// } -// if (!validateWhenInit || ProxyUtils.validateProxy(entry.getValue().getHttpHost())) { -// entry.getValue().setFailedNum(0); -// entry.getValue().setReuseTimeInterval(reuseInterval); -// proxyQueue.add(entry.getValue()); -// allProxy.put(entry.getKey(), entry.getValue()); -// } -// } catch (NumberFormatException e) { -// logger.error("HttpHost init error:", e); -// } -// } -// logger.info("proxy pool size>>>>" + allProxy.size()); -// } -// -// public void addProxy(Proxy... httpProxyList) { -// isEnable = true; -// for (Proxy proxy : httpProxyList) { -// if (!validateWhenInit || ProxyUtils.validateProxy(proxy.getProxyHost())) { -// TimerReuseProxy p = new TimerReuseProxy(proxy.getProxyHost(), proxy.getUsername(), proxy.getPassword(), reuseInterval); -// proxyQueue.add(p); -// allProxy.put(p.getProxyHost().getHost(), p); -// } -// } -// logger.info("proxy pool size>>>>" + allProxy.size()); -// } -// -// public TimerReuseProxy getProxy() { -// TimerReuseProxy proxy = null; -// try { -// Long time = System.currentTimeMillis(); -// proxy = proxyQueue.take(); -// double costTime = (System.currentTimeMillis() - time) / 1000.0; -// if (costTime > reuseInterval) { -// logger.info("get proxy time >>>> " + costTime); -// } -// TimerReuseProxy p = allProxy.get(proxy.getProxyHost().getHost()); -// p.setLastBorrowTime(System.currentTimeMillis()); -// p.borrowNumIncrement(1); -// } catch (InterruptedException e) { -// logger.error("get proxy error", e); -// } -// if (proxy == null) { -// throw new NoSuchElementException(); -// } -// return proxy; -// } -// -// public void returnProxy(Proxy proxy, int statusCode) { -// TimerReuseProxy p = allProxy.get(proxy.getProxyHost()); -// if (p == null) { -// return; -// } -// switch (statusCode) { -// case TimerReuseProxy.SUCCESS: -// p.setReuseTimeInterval(reuseInterval); -// p.setFailedNum(0); -// p.setFailedErrorType(new ArrayList()); -// p.recordResponse(); -// p.successNumIncrement(1); -// break; -// case TimerReuseProxy.ERROR_403: -// // banned,try longer interval -// p.fail(TimerReuseProxy.ERROR_403); -// p.setReuseTimeInterval(reuseInterval * p.getFailedNum()); -// logger.info(proxy + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0); -// break; -// case TimerReuseProxy.ERROR_BANNED: -// p.fail(TimerReuseProxy.ERROR_BANNED); -// p.setReuseTimeInterval(10 * 60 * 1000 * p.getFailedNum()); -// logger.info(proxy + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0); -// break; -// case TimerReuseProxy.ERROR_404: -// // p.fail(Proxy.ERROR_404); -// // p.setReuseTimeInterval(reuseInterval * p.getFailedNum()); -// break; -// default: -// p.fail(statusCode); -// break; -// } -// if (p.getFailedNum() > 20) { -// p.setReuseTimeInterval(reviveTime); -// logger.error("remove proxy >>>> " + proxy + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size()); -// return; -// } -// if (p.getFailedNum() > 0 && p.getFailedNum() % 5 == 0) { -// if (!ProxyUtils.validateProxy(proxy)) { -// p.setReuseTimeInterval(reviveTime); -// logger.error("remove proxy >>>> " + proxy + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size()); -// return; -// } -// } -// try { -// proxyQueue.put(p); -// } catch (InterruptedException e) { -// logger.warn("proxyQueue return proxy error", e); -// } -// } -// -// public String allProxyStatus() { -// String re = "all proxy info >>>> \n"; -// for (Entry entry : allProxy.entrySet()) { -// re += entry.getValue().toString() + "\n"; -// } -// return re; -// } -// -// public int getIdleNum() { -// return proxyQueue.size(); -// } -// -// public int getReuseInterval() { -// return reuseInterval; -// } -// -// public void setReuseInterval(int reuseInterval) { -// this.reuseInterval = reuseInterval; -// } -// -// public void enable(boolean isEnable) { -// this.isEnable = isEnable; -// } -// -// public boolean isEnable() { -// return isEnable; -// } -// -// public int getReviveTime() { -// return reviveTime; -// } -// -// public void setReviveTime(int reviveTime) { -// this.reviveTime = reviveTime; -// } -// -// public boolean isValidateWhenInit() { -// return validateWhenInit; -// } -// -// public void validateWhenInit(boolean validateWhenInit) { -// this.validateWhenInit = validateWhenInit; -// } -// -// public int getSaveProxyInterval() { -// return saveProxyInterval; -// } -// -// public void setSaveProxyInterval(int saveProxyInterval) { -// this.saveProxyInterval = saveProxyInterval; -// } -// -// public String getProxyFilePath() { -// return proxyFilePath; -// } -// -// public void setProxyFilePath(String proxyFilePath) { -// this.proxyFilePath = proxyFilePath; -// } - -} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/PriorityScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/PriorityScheduler.java index 8fa1b9ea29996c884b98614ebc79d80711d8d2fb..14cbaff327a3b9778ddd08372746a606e5577fd1 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/PriorityScheduler.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/PriorityScheduler.java @@ -1,6 +1,5 @@ package us.codecraft.webmagic.scheduler; -import org.apache.http.annotation.ThreadSafe; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.utils.NumberUtils; @@ -16,7 +15,6 @@ import java.util.concurrent.PriorityBlockingQueue; * @author code4crafter@gmail.com
* @since 0.2.1 */ -@ThreadSafe public class PriorityScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler { public static final int INITIAL_CAPACITY = 5; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java index 078506c6f9e09d4b1d1392275b8c0a064cf065d4..f9ad0e98f8fb0cff89d28248e852e417b5cd229d 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java @@ -1,6 +1,5 @@ package us.codecraft.webmagic.scheduler; -import org.apache.http.annotation.ThreadSafe; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; @@ -15,7 +14,6 @@ import java.util.concurrent.LinkedBlockingQueue; * @author code4crafter@gmail.com
* @since 0.1.0 */ -@ThreadSafe public class QueueScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler { private BlockingQueue queue = new LinkedBlockingQueue(); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java index d80e8b48e813b8a4e08a65a5e1299b6229618d10..f2218f12611a5dadeba6c651a740132cb698677b 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java @@ -3,7 +3,6 @@ package us.codecraft.webmagic.selector; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; -import org.jsoup.nodes.Entities; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -20,33 +19,28 @@ public class Html extends HtmlNode { private Logger logger = LoggerFactory.getLogger(getClass()); - private static volatile boolean INITED = false; - /** * Disable jsoup html entity escape. It can be set just before any Html instance is created. + * @deprecated */ public static boolean DISABLE_HTML_ENTITY_ESCAPE = false; - /** - * Disable jsoup html entity escape. It is a hack way only for jsoup 1.7.2. - */ - private void disableJsoupHtmlEntityEscape() { - if (DISABLE_HTML_ENTITY_ESCAPE && !INITED) { - Entities.EscapeMode.base.getMap().clear(); - Entities.EscapeMode.extended.getMap().clear(); - Entities.EscapeMode.xhtml.getMap().clear(); - INITED = true; - } - } - /** * Store parsed document for better performance when only one text exist. */ private Document document; + public Html(String text, String url) { + try { + this.document = Jsoup.parse(text, url); + } catch (Exception e) { + this.document = null; + logger.warn("parse document error ", e); + } + } + public Html(String text) { try { - disableJsoupHtmlEntityEscape(); this.document = Jsoup.parse(text); } catch (Exception e) { this.document = null; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java index 030522f0ab3f8f32e45218896396dae66bff5126..c063b48259e2f8ab3fec807b87ca7322a75b8a6d 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java @@ -34,7 +34,7 @@ public class HtmlNode extends AbstractSelectable { @Override public Selectable links() { - return xpath("//a/@href"); + return selectElements(new LinksSelector()); } @Override @@ -90,7 +90,7 @@ public class HtmlNode extends AbstractSelectable { * See: https://github.com/code4craft/webmagic/issues/113 * * @param elementIterator elementIterator - * @param element element + * @return element element */ private Element checkElementAndConvert(ListIterator elementIterator) { Element element = elementIterator.next(); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java index b0b90f9bf2fcc75445184a7d379eccc47f3bdd41..f5c0baeb591a132d8272dc795e537f77f8c03c89 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java @@ -1,9 +1,11 @@ package us.codecraft.webmagic.selector; +import com.alibaba.fastjson.JSON; import com.jayway.jsonpath.JsonPath; import java.util.ArrayList; import java.util.List; +import java.util.Map; /** * JsonPath selector.
@@ -32,12 +34,20 @@ public class JsonPathSelector implements Selector { if (object instanceof List) { List list = (List) object; if (list != null && list.size() > 0) { - return list.iterator().next().toString(); + return toString(list.iterator().next()); } } return object.toString(); } + private String toString(Object object) { + if (object instanceof Map) { + return JSON.toJSONString(object); + } else { + return String.valueOf(object); + } + } + @Override public List selectList(String text) { List list = new ArrayList(); @@ -48,10 +58,10 @@ public class JsonPathSelector implements Selector { if (object instanceof List) { List items = (List) object; for (Object item : items) { - list.add(String.valueOf(item)); + list.add(toString(item)); } } else { - list.add(String.valueOf(object)); + list.add(toString(object)); } return list; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/LinksSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/LinksSelector.java new file mode 100644 index 0000000000000000000000000000000000000000..5296a74bdaec2e3fbf6087ddf8b85328d6191fe8 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/LinksSelector.java @@ -0,0 +1,51 @@ +package us.codecraft.webmagic.selector; + +import org.jsoup.helper.StringUtil; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +import java.util.ArrayList; +import java.util.List; + +/** + * Links selector based on jsoup. Use absolute url.
+ * + * @author code4crafter@gmail.com
+ * @since 0.7.0 + */ +public class LinksSelector extends BaseElementSelector { + + @Override + public String select(Element element) { + throw new UnsupportedOperationException(); + } + + @Override + public List selectList(Element element) { + Elements elements = element.select("a"); + List links = new ArrayList(elements.size()); + for (Element element0 : elements) { + if (!StringUtil.isBlank(element0.baseUri())) { + links.add(element0.attr("abs:href")); + } else { + links.add(element0.attr("href")); + } + } + return links; + } + + @Override + public Element selectElement(Element element) { + throw new UnsupportedOperationException(); + } + + @Override + public List selectElements(Element element) { + throw new UnsupportedOperationException(); + } + + @Override + public boolean hasAttribute() { + return true; + } +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java index 584cf900e4ac93b2be89d0f480f487bcac99ad5c..9ae538c0ff42788df0eac8f62c97421f32a8604c 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java @@ -23,49 +23,33 @@ public class RegexSelector implements Selector { private int group = 1; public RegexSelector(String regexStr, int group) { + this.compileRegex(regexStr); + this.group = group; + } + + private void compileRegex(String regexStr) { if (StringUtils.isBlank(regexStr)) { throw new IllegalArgumentException("regex must not be empty"); } - // Check bracket for regex group. Add default group 1 if there is no group. - // Only check if there exists the valid left parenthesis, leave regexp validation for Pattern. - if ( ! hasGroup(regexStr) ){ - regexStr = "(" + regexStr + ")"; - } - this.regexStr = regexStr; try { - regex = Pattern.compile(regexStr, Pattern.DOTALL | Pattern.CASE_INSENSITIVE); + this.regex = Pattern.compile(regexStr, Pattern.DOTALL | Pattern.CASE_INSENSITIVE); + this.regexStr = regexStr; } catch (PatternSyntaxException e) { - throw new IllegalArgumentException("invalid regex", e); + throw new IllegalArgumentException("invalid regex "+regexStr, e); } - this.group = group; } + /** + * Create a RegexSelector. When there is no capture group, the value is set to 0 else set to 1. + * @param regexStr + */ public RegexSelector(String regexStr) { - this(regexStr, 1); - } - - private boolean hasGroup(String regexStr) { - if (StringUtils.countMatches(regexStr, "(") - StringUtils.countMatches(regexStr, "\\(") == - StringUtils.countMatches(regexStr, "(?:") - StringUtils.countMatches(regexStr, "\\(?:")){ - return false; - } - if (StringUtils.countMatches(regexStr, "(") - StringUtils.countMatches(regexStr, "\\(") == - StringUtils.countMatches(regexStr, "(?=") - StringUtils.countMatches(regexStr, "\\(?=") ) { - return false; - } - if (StringUtils.countMatches(regexStr, "(") - StringUtils.countMatches(regexStr, "\\(") == - StringUtils.countMatches(regexStr, "(?<") - StringUtils.countMatches(regexStr, "\\(?<") ) { - return false; - } - if (StringUtils.countMatches(regexStr, "(") - StringUtils.countMatches(regexStr, "\\(") == - StringUtils.countMatches(regexStr, "(?!") - StringUtils.countMatches(regexStr, "\\(?!") ) { - return false; - } - if (StringUtils.countMatches(regexStr, "(") - StringUtils.countMatches(regexStr, "\\(") == - StringUtils.countMatches(regexStr, "(?#") - StringUtils.countMatches(regexStr, "\\(?#") ) { - return false; + this.compileRegex(regexStr); + if (regex.matcher("").groupCount() == 0) { + this.group = 0; + } else { + this.group = 1; } - return true; } @Override diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java index 50b4f1b60a83a523c4bbfc97b2edac8691cd5bef..ccf00a466f5b2063685770b7c852c61115687560 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java @@ -26,7 +26,7 @@ public abstract class CharsetUtils { // charset // 1、encoding in http header Content-Type charset = UrlUtils.getCharset(contentType); - if (StringUtils.isNotBlank(contentType)) { + if (StringUtils.isNotBlank(contentType) && StringUtils.isNotBlank(charset)) { logger.debug("Auto get charset: {}", charset); return charset; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/HttpConstant.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/HttpConstant.java index 2a76ecca24472cecc3cfd9a04cf1e1e416da781e..2d6b8fe2a2b257d67ed35c1e117d341b4c55cc5c 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/HttpConstant.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/HttpConstant.java @@ -25,6 +25,12 @@ public abstract class HttpConstant { } + public static abstract class StatusCode { + + public static final int CODE_200 = 200; + + } + public static abstract class Header { public static final String REFERER = "Referer"; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java index ed7ae8c1598584803e773f7b0da7751cab4f3bd6..c61483a39c5dd142df3f94605932c812c3799dbc 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java @@ -43,7 +43,7 @@ public class UrlUtils { if (url.startsWith("?")) url = base.getPath() + url; URL abs = new URL(base, url); - return encodeIllegalCharacterInUrl(abs.toExternalForm()); + return abs.toExternalForm(); } catch (MalformedURLException e) { return ""; } @@ -53,12 +53,17 @@ public class UrlUtils { * * @param url url * @return new url + * @deprecated */ public static String encodeIllegalCharacterInUrl(String url) { - //TODO more charator support return url.replace(" ", "%20"); } + public static String fixIllegalCharacterInUrl(String url) { + //TODO more charator support + return url.replace(" ", "%20").replaceAll("#+", "#"); + } + public static String getHost(String url) { String host = url; int i = StringUtils.ordinalIndexOf(url, "/", 3); @@ -80,7 +85,7 @@ public class UrlUtils { if (i > 0) { domain = StringUtils.substring(domain, 0, i); } - return domain; + return removePort(domain); } public static String removePort(String domain) { @@ -92,41 +97,6 @@ public class UrlUtils { } } - /** - * allow blank space in quote - */ - private static Pattern patternForHrefWithQuote = Pattern.compile("(]*href=)[\"']([^\"'<>]*)[\"']", Pattern.CASE_INSENSITIVE); - - /** - * disallow blank space without quote - */ - private static Pattern patternForHrefWithoutQuote = Pattern.compile("(]*href=)([^\"'<>\\s]+)", Pattern.CASE_INSENSITIVE); - - public static String fixAllRelativeHrefs(String html, String url) { - html = replaceByPattern(html, url, patternForHrefWithQuote); - html = replaceByPattern(html, url, patternForHrefWithoutQuote); - return html; - } - - public static String replaceByPattern(String html, String url, Pattern pattern) { - StringBuilder stringBuilder = new StringBuilder(); - Matcher matcher = pattern.matcher(html); - int lastEnd = 0; - boolean modified = false; - while (matcher.find()) { - modified = true; - stringBuilder.append(StringUtils.substring(html, lastEnd, matcher.start())); - stringBuilder.append(matcher.group(1)); - stringBuilder.append("\"").append(canonicalizeUrl(matcher.group(2), url)).append("\""); - lastEnd = matcher.end(); - } - if (!modified) { - return html; - } - stringBuilder.append(StringUtils.substring(html, lastEnd)); - return stringBuilder.toString(); - } - public static List convertToRequests(Collection urls) { List requestList = new ArrayList(urls.size()); for (String url : urls) { @@ -143,7 +113,7 @@ public class UrlUtils { return urlList; } - private static final Pattern patternForCharset = Pattern.compile("charset\\s*=\\s*['\"]*([^\\s;'\"]*)"); + private static final Pattern patternForCharset = Pattern.compile("charset\\s*=\\s*['\"]*([^\\s;'\"]*)", Pattern.CASE_INSENSITIVE); public static String getCharset(String contentType) { Matcher matcher = patternForCharset.matcher(contentType); diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java index 6cf5382080bce945786f63267032240b5e20829d..f42f68d741123068bc48a71433adcddffd14b826 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java @@ -30,7 +30,6 @@ public class HtmlTest { @Test public void testEnableJsoupHtmlEntityEscape() throws Exception { - Html.DISABLE_HTML_ENTITY_ESCAPE = false; Html html = new Html("aaaaaaa&b"); assertThat(html.regex("(aaaaaaa&b)").toString()).isEqualTo("aaaaaaa&b"); } @@ -48,4 +47,14 @@ public class HtmlTest { Selectable selectable = html.xpath("//a[1]").nodes().get(0); assertThat(selectable.xpath("/a/@href").get()).isEqualTo("/xx/xx"); } + + @Test + public void testGetHrefsByJsoup(){ + Html html = new Html("issues","https://github.com/code4craft/webmagic/"); + assertThat(html.xpath("//a[1]/@abs:href").get()).isEqualTo("https://github.com/code4craft/webmagic/issues"); + assertThat(html.xpath("//img/@abs:src").get()).isEqualTo("https://github.com/code4craft/webmagic/webmagic.jpg"); + html = new Html("issues"); + assertThat(html.xpath("//a[1]/@abs:href").get()).isEqualTo("https://github.com/code4craft/webmagic/issues"); + assertThat(html.xpath("//img/@abs:src").get()).isEqualTo("https://github.com/code4craft/webmagic/webmagic.jpg"); + } } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java index e600bf98be3b32886eccd36534b157bb1d006e07..ece060003316e0b79980a30a5adaf75b7ca86d25 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java @@ -3,6 +3,7 @@ package us.codecraft.webmagic.downloader; import com.github.dreamhead.moco.HttpServer; import com.github.dreamhead.moco.Runnable; import com.github.dreamhead.moco.Runner; +import org.apache.commons.collections.map.HashedMap; import org.apache.commons.io.IOUtils; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpUriRequest; @@ -14,12 +15,16 @@ import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.model.HttpRequestBody; +import us.codecraft.webmagic.proxy.Proxy; +import us.codecraft.webmagic.proxy.SimpleProxyProvider; import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.utils.CharsetUtils; import us.codecraft.webmagic.utils.HttpConstant; import java.io.IOException; import java.io.UnsupportedEncodingException; +import java.util.Map; import static com.github.dreamhead.moco.Moco.*; import static org.assertj.core.api.Assertions.assertThat; @@ -31,7 +36,7 @@ import static org.junit.Assert.assertTrue; */ public class HttpClientDownloaderTest { - public static final String PAGE_ALWAYS_NOT_EXISTS = "http://localhost:13421/404"; + public static final String PAGE_ALWAYS_NOT_EXISTS = "http://localhost:13423/404"; @Test public void testDownloader() { @@ -47,41 +52,38 @@ public class HttpClientDownloaderTest { } @Test - public void testCycleTriedTimes() { + public void test_download_fail() { HttpClientDownloader httpClientDownloader = new HttpClientDownloader(); Task task = Site.me().setDomain("localhost").setCycleRetryTimes(5).toTask(); Request request = new Request(PAGE_ALWAYS_NOT_EXISTS); Page page = httpClientDownloader.download(request, task); - assertThat(page.getTargetRequests().size() > 0); - assertThat((Integer) page.getTargetRequests().get(0).getExtra(Request.CYCLE_TRIED_TIMES)).isEqualTo(1); - page = httpClientDownloader.download(page.getTargetRequests().get(0), task); - assertThat((Integer) page.getTargetRequests().get(0).getExtra(Request.CYCLE_TRIED_TIMES)).isEqualTo(2); + assertThat(page.isDownloadSuccess()).isFalse(); } @Test public void testGetHtmlCharset() throws Exception { - HttpServer server = httpserver(12306); + HttpServer server = httpServer(13423); server.get(by(uri("/header"))).response(header("Content-Type", "text/html; charset=gbk")); server.get(by(uri("/meta4"))).response(with(text("\n" + " \n" + " \n" + " \n" + " \n" + - "")),header("Content-Type","")); + "")),header("Content-Type","text/html; charset=gbk")); server.get(by(uri("/meta5"))).response(with(text("\n" + " \n" + " \n" + " \n" + " \n" + - "")),header("Content-Type","")); + "")),header("Content-Type","text/html")); Runner.running(server, new Runnable() { @Override public void run() { - String charset = getCharsetByUrl("http://127.0.0.1:12306/header"); + String charset = getCharsetByUrl("http://127.0.0.1:13423/header"); assertEquals(charset, "gbk"); - charset = getCharsetByUrl("http://127.0.0.1:12306/meta4"); + charset = getCharsetByUrl("http://127.0.0.1:13423/meta4"); assertEquals(charset, "gbk"); - charset = getCharsetByUrl("http://127.0.0.1:12306/meta5"); + charset = getCharsetByUrl("http://127.0.0.1:13423/meta5"); assertEquals(charset, "gbk"); } @@ -93,7 +95,7 @@ public class HttpClientDownloaderTest { Request requestGBK = new Request(url); CloseableHttpResponse httpResponse = null; try { - httpResponse = httpClient.execute(new HttpUriRequestConverter().convert(requestGBK, site, null)); + httpResponse = httpClient.execute(new HttpUriRequestConverter().convert(requestGBK, site, null).getHttpUriRequest()); } catch (IOException e) { e.printStackTrace(); } @@ -111,7 +113,8 @@ public class HttpClientDownloaderTest { @Test public void test_selectRequestMethod() throws Exception { - HttpServer server = httpserver(12306); + final int port = 13423; + HttpServer server = httpServer(port); server.get(eq(query("q"), "webmagic")).response("get"); server.post(eq(form("q"), "webmagic")).response("post"); server.put(eq(form("q"), "webmagic")).response("put"); @@ -124,43 +127,200 @@ public class HttpClientDownloaderTest { @Override public void run() throws Exception { Request request = new Request(); - request.setUrl("http://127.0.0.1:12306/search"); - request.putParams("q", "webmagic"); + request.setUrl("http://127.0.0.1:" + port + "/search?q=webmagic"); request.setMethod(HttpConstant.Method.GET); - HttpUriRequest httpUriRequest = httpUriRequestConverter.convert(request,site,null); + Map params = new HashedMap(); + params.put("q","webmagic"); + HttpUriRequest httpUriRequest = httpUriRequestConverter.convert(request,site,null).getHttpUriRequest(); assertThat(EntityUtils.toString(HttpClients.custom().build().execute(httpUriRequest).getEntity())).isEqualTo("get"); - request.setMethod(HttpConstant.Method.POST); - httpUriRequest = httpUriRequestConverter.convert(request, site, null); - assertThat(EntityUtils.toString(HttpClients.custom().build().execute(httpUriRequest).getEntity())).isEqualTo("post"); - request.setMethod(HttpConstant.Method.PUT); - httpUriRequest = httpUriRequestConverter.convert(request, site, null); - assertThat(EntityUtils.toString(HttpClients.custom().build().execute(httpUriRequest).getEntity())).isEqualTo("put"); request.setMethod(HttpConstant.Method.DELETE); - httpUriRequest = httpUriRequestConverter.convert(request, site, null); + httpUriRequest = httpUriRequestConverter.convert(request, site, null).getHttpUriRequest(); assertThat(EntityUtils.toString(HttpClients.custom().build().execute(httpUriRequest).getEntity())).isEqualTo("delete"); request.setMethod(HttpConstant.Method.HEAD); - httpUriRequest = httpUriRequestConverter.convert(request, site, null); + httpUriRequest = httpUriRequestConverter.convert(request, site, null).getHttpUriRequest(); assertThat(HttpClients.custom().build().execute(httpUriRequest).getFirstHeader("method").getValue()).isEqualTo("head"); request.setMethod(HttpConstant.Method.TRACE); - httpUriRequest = httpUriRequestConverter.convert(request, site, null); + httpUriRequest = httpUriRequestConverter.convert(request, site, null).getHttpUriRequest(); assertThat(EntityUtils.toString(HttpClients.custom().build().execute(httpUriRequest).getEntity())).isEqualTo("trace"); + request.setUrl("http://127.0.0.1:" + port + "/search"); + request.setMethod(HttpConstant.Method.POST); + request.setRequestBody(HttpRequestBody.form(params, "utf-8")); + httpUriRequest = httpUriRequestConverter.convert(request, site, null).getHttpUriRequest(); + assertThat(EntityUtils.toString(HttpClients.custom().build().execute(httpUriRequest).getEntity())).isEqualTo("post"); + request.setMethod(HttpConstant.Method.PUT); + httpUriRequest = httpUriRequestConverter.convert(request, site, null).getHttpUriRequest(); + assertThat(EntityUtils.toString(HttpClients.custom().build().execute(httpUriRequest).getEntity())).isEqualTo("put"); + } + }); + } + + @Test + public void test_set_request_cookie() throws Exception { + HttpServer server = httpServer(13423); + server.get(eq(cookie("cookie"), "cookie-webmagic")).response("ok"); + Runner.running(server, new Runnable() { + @Override + public void run() throws Exception { + HttpClientDownloader httpClientDownloader = new HttpClientDownloader(); + Request request = new Request(); + request.setUrl("http://127.0.0.1:13423"); + request.addCookie("cookie","cookie-webmagic"); + Page page = httpClientDownloader.download(request, Site.me().toTask()); + assertThat(page.getRawText()).isEqualTo("ok"); + } + }); + } + + @Test + public void test_disableCookieManagement() throws Exception { + HttpServer server = httpServer(13423); + server.get(not(eq(cookie("cookie"), "cookie-webmagic"))).response("ok"); + Runner.running(server, new Runnable() { + @Override + public void run() throws Exception { + HttpClientDownloader httpClientDownloader = new HttpClientDownloader(); + Request request = new Request(); + request.setUrl("http://127.0.0.1:13423"); + request.addCookie("cookie","cookie-webmagic"); + Page page = httpClientDownloader.download(request, Site.me().setDisableCookieManagement(true).toTask()); + assertThat(page.getRawText()).isEqualTo("ok"); + } + }); + } + + @Test + public void test_set_request_header() throws Exception { + HttpServer server = httpServer(13423); + server.get(eq(header("header"), "header-webmagic")).response("ok"); + Runner.running(server, new Runnable() { + @Override + public void run() throws Exception { + HttpClientDownloader httpClientDownloader = new HttpClientDownloader(); + Request request = new Request(); + request.setUrl("http://127.0.0.1:13423"); + request.addHeader("header","header-webmagic"); + Page page = httpClientDownloader.download(request, Site.me().toTask()); + assertThat(page.getRawText()).isEqualTo("ok"); + } + }); + } + + @Test + public void test_set_site_header() throws Exception { + HttpServer server = httpServer(13423); + server.get(eq(header("header"), "header-webmagic")).response("ok"); + Runner.running(server, new Runnable() { + @Override + public void run() throws Exception { + HttpClientDownloader httpClientDownloader = new HttpClientDownloader(); + Request request = new Request(); + request.setUrl("http://127.0.0.1:13423"); + Page page = httpClientDownloader.download(request, Site.me().addHeader("header","header-webmagic").toTask()); + assertThat(page.getRawText()).isEqualTo("ok"); + } + }); + } + + @Test + public void test_set_site_cookie() throws Exception { + HttpServer server = httpServer(13423); + server.get(eq(cookie("cookie"), "cookie-webmagic")).response("ok"); + Runner.running(server, new Runnable() { + @Override + public void run() throws Exception { + HttpClientDownloader httpClientDownloader = new HttpClientDownloader(); + Request request = new Request(); + request.setUrl("http://127.0.0.1:13423"); + Site site = Site.me().addCookie("cookie", "cookie-webmagic").setDomain("127.0.0.1"); + Page page = httpClientDownloader.download(request, site.toTask()); + assertThat(page.getRawText()).isEqualTo("ok"); } }); } @Test public void test_download_when_task_is_null() throws Exception { - HttpServer server = httpserver(12306); + HttpServer server = httpServer(13423); server.response("foo"); Runner.running(server, new Runnable() { @Override public void run() throws Exception { final HttpClientDownloader httpClientDownloader = new HttpClientDownloader(); Request request = new Request(); - request.setUrl("http://127.0.0.1:12306/"); + request.setUrl("http://127.0.0.1:13423/"); Page page = httpClientDownloader.download(request, Site.me().toTask()); assertThat(page.getRawText()).isEqualTo("foo"); } }); } + + @Test + public void test_download_auth_by_SimpleProxyProvider() throws Exception { + HttpServer server = httpServer(13423); + server.get(eq(header("Proxy-Authorization"), "Basic dXNlcm5hbWU6cGFzc3dvcmQ=")).response("ok"); + Runner.running(server, new Runnable() { + @Override + public void run() throws Exception { + HttpClientDownloader httpClientDownloader = new HttpClientDownloader(); + httpClientDownloader.setProxyProvider(SimpleProxyProvider.from(new Proxy("127.0.0.1", 13423, "username", "password"))); + Request request = new Request(); + request.setUrl("http://www.baidu.com"); + Page page = httpClientDownloader.download(request, Site.me().toTask()); + assertThat(page.getRawText()).isEqualTo("ok"); + } + }); + } + + @Test + public void test_download_binary_content() throws Exception { + HttpServer server = httpServer(13423); + server.response("binary"); + Runner.running(server, new Runnable() { + @Override + public void run() throws Exception { + final HttpClientDownloader httpClientDownloader = new HttpClientDownloader(); + Request request = new Request(); + request.setBinaryContent(true); + request.setUrl("http://127.0.0.1:13423/"); + Page page = httpClientDownloader.download(request, Site.me().toTask()); + assertThat(page.getRawText()).isNull(); + assertThat(page.getBytes()).isEqualTo("binary".getBytes()); + } + }); + } + + @Test + public void test_download_set_charset() throws Exception { + HttpServer server = httpServer(13423); + server.response(header("Content-Type","text/html; charset=utf-8")).response("hello world!"); + Runner.running(server, new Runnable() { + @Override + public void run() throws Exception { + final HttpClientDownloader httpClientDownloader = new HttpClientDownloader(); + Request request = new Request(); + request.setUrl("http://127.0.0.1:13423/"); + Page page = httpClientDownloader.download(request, Site.me().toTask()); + assertThat(page.getCharset()).isEqualTo("utf-8"); + } + }); + } + + @Test + public void test_download_set_request_charset() throws Exception { + HttpServer server = httpServer(13423); + server.response("hello world!"); + Runner.running(server, new Runnable() { + @Override + public void run() throws Exception { + final HttpClientDownloader httpClientDownloader = new HttpClientDownloader(); + Request request = new Request(); + request.setCharset("utf-8"); + request.setUrl("http://127.0.0.1:13423/"); + Page page = httpClientDownloader.download(request, Site.me().setCharset("gbk").toTask()); + assertThat(page.getCharset()).isEqualTo("utf-8"); + } + }); + } + + } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpUriRequestConverterTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpUriRequestConverterTest.java new file mode 100644 index 0000000000000000000000000000000000000000..e7da1b9ab3da1cc7a95607240762e3c6b617ef6c --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpUriRequestConverterTest.java @@ -0,0 +1,25 @@ +package us.codecraft.webmagic.downloader; + +import org.junit.Test; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.utils.UrlUtils; + +import java.net.URI; + +import static org.assertj.core.api.Assertions.assertThat; + +/** + * @author code4crafter@gmail.com + * Date: 2017/7/22 + * Time: 下午5:29 + */ +public class HttpUriRequestConverterTest { + + @Test + public void test_illegal_uri_correct() throws Exception { + HttpUriRequestConverter httpUriRequestConverter = new HttpUriRequestConverter(); + HttpClientRequestContext requestContext = httpUriRequestConverter.convert(new Request(UrlUtils.fixIllegalCharacterInUrl("http://bj.zhongkao.com/beikao/yimo/##")), Site.me(), null); + assertThat(requestContext.getHttpUriRequest().getURI()).isEqualTo(new URI("http://bj.zhongkao.com/beikao/yimo/#")); + } +} \ No newline at end of file diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/SSLCompatibilityTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/SSLCompatibilityTest.java new file mode 100644 index 0000000000000000000000000000000000000000..861b315a631941093af0b57acf5f7feb1a342704 --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/SSLCompatibilityTest.java @@ -0,0 +1,26 @@ +package us.codecraft.webmagic.downloader; + +import org.junit.Test; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Task; + +import static org.assertj.core.api.Assertions.assertThat; + +/** + * @author code4crafter@gmail.com + * Date: 2017/11/29 + * Time: 下午1:32 + */ +public class SSLCompatibilityTest { + + @Test + public void test_tls12() throws Exception { + HttpClientDownloader httpClientDownloader = new HttpClientDownloader(); + Task task = Site.me().setCycleRetryTimes(5).toTask(); + Request request = new Request("https://juejin.im/"); + Page page = httpClientDownloader.download(request, task); + assertThat(page.isDownloadSuccess()).isTrue(); + } +} diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/SimpleProxyProviderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/SimpleProxyProviderTest.java new file mode 100644 index 0000000000000000000000000000000000000000..6495b16bff3d3990942ae404dedf84d1654fbe58 --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/SimpleProxyProviderTest.java @@ -0,0 +1,30 @@ +package us.codecraft.webmagic.proxy; + +import org.junit.Test; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Task; + +import static org.assertj.core.api.Assertions.assertThat; + +/** + * @author code4crafter@gmail.com + * Date: 17/4/16 + * Time: 上午10:29 + */ +public class SimpleProxyProviderTest { + + public static final Task TASK = Site.me().toTask(); + + @Test + public void test_get_proxy() throws Exception { + Proxy originProxy1 = new Proxy("127.0.0.1", 1087); + Proxy originProxy2 = new Proxy("127.0.0.1", 1088); + SimpleProxyProvider proxyProvider = SimpleProxyProvider.from(originProxy1, originProxy2); + Proxy proxy = proxyProvider.getProxy(TASK); + assertThat(proxy).isEqualTo(originProxy1); + proxy = proxyProvider.getProxy(TASK); + assertThat(proxy).isEqualTo(originProxy2); + proxy = proxyProvider.getProxy(TASK); + assertThat(proxy).isEqualTo(originProxy1); + } +} diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/JsonPathSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/JsonPathSelectorTest.java index b88e51ee6d6bb11fae3f7fa052f8d2e4d4619f36..6dff0faafa96386040f1dac8219316477055016d 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/JsonPathSelectorTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/JsonPathSelectorTest.java @@ -52,4 +52,5 @@ public class JsonPathSelectorTest { JSONObject object2=JSON.parseObject("{\"author\":\"Nigel Rees\",\"title\":\"Sayings of the Century\",\"category\":\"reference\",\"price\":8.95}"); assertThat(object1).isEqualTo(object2); } + } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/LinksSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/LinksSelectorTest.java new file mode 100644 index 0000000000000000000000000000000000000000..75a291348870825af9c2b69bb409f10a6fec188f --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/LinksSelectorTest.java @@ -0,0 +1,27 @@ +package us.codecraft.webmagic.selector; + +import org.jsoup.Jsoup; +import org.junit.Test; + +import java.util.List; + +/** + * @author code4crafter@gmail.com + * Date: 17/4/8 + * Time: 下午9:41 + */ +public class LinksSelectorTest { + + private String html = "
"; + + @Test + public void testLinks() throws Exception { + LinksSelector linksSelector = new LinksSelector(); + List links = linksSelector.selectList(html); + System.out.println(links); + + html = "
"; + links = linksSelector.selectList(Jsoup.parse(html, "http://whatever.com/")); + System.out.println(links); + } +} diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/RegexSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/RegexSelectorTest.java index 144e6fe2c1b173226470d45f7799dcd4d82bfbc3..871caa144c54a760daa4803fde19af57efc4c3df 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/RegexSelectorTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/RegexSelectorTest.java @@ -25,8 +25,8 @@ public class RegexSelectorTest { @Test public void testRegexWithZeroWidthAssertions() { - String regex = "^.*(?=\\?)"; - String source = "hello world?xxxx"; + String regex = "^.*(?=\\?)(?!\\?yy)"; + String source = "hello world?xx?yy"; RegexSelector regexSelector = new RegexSelector(regex); String select = regexSelector.select(source); Assertions.assertThat(select).isEqualTo("hello world"); diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java index a90304dcd1a5b34741162e4cb5770bdd5f94ede2..6afdeefe4042d6b97b899a11354ff70806cce111 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java @@ -33,25 +33,6 @@ public class UrlUtilsTest { assertThat(absoluteUrl).isEqualTo("http://www.dianping.com/aa"); } - @Test - public void testFixAllRelativeHrefs() { - String originHtml = ""; - String replacedHtml = UrlUtils.fixAllRelativeHrefs(originHtml, "http://www.dianping.com/"); - assertThat(replacedHtml).isEqualTo(""); - - originHtml = ""; - replacedHtml = UrlUtils.fixAllRelativeHrefs(originHtml, "http://www.dianping.com/"); - assertThat(replacedHtml).isEqualTo(""); - - originHtml = ""; - replacedHtml = UrlUtils.fixAllRelativeHrefs(originHtml, "http://www.dianping.com/"); - assertThat(replacedHtml).isEqualTo(""); - - originHtml = ""; - replacedHtml = UrlUtils.fixAllRelativeHrefs(originHtml, "http://www.dianping.com/"); - assertThat(replacedHtml).isEqualTo(""); - } - @Test public void testGetDomain(){ String url = "http://www.dianping.com/aa/"; diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 08488178a075fb32c1a7be6f7d2e2e5af5f58284..7e949ca6f9fb43866b32da1705e835529e8339e0 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.7.0-SNAPSHOT + 0.7.3 4.0.0 diff --git a/webmagic-extension/pom.xml.versionsBackup b/webmagic-extension/pom.xml.versionsBackup deleted file mode 100644 index 47496eca6a8ec30d85abdc5d84fdb46ff4f032ac..0000000000000000000000000000000000000000 --- a/webmagic-extension/pom.xml.versionsBackup +++ /dev/null @@ -1,29 +0,0 @@ - - - - us.codecraft - webmagic-parent - 0.5.2 - - 4.0.0 - - webmagic-extension - - - - redis.clients - jedis - 2.0.0 - - - us.codecraft - webmagic-core - ${project.version} - - - junit - junit - - - - \ No newline at end of file diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/SimpleHttpClient.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/SimpleHttpClient.java new file mode 100644 index 0000000000000000000000000000000000000000..67f4348ed8af67cdbc8d11632d65fc5e4799b534 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/SimpleHttpClient.java @@ -0,0 +1,51 @@ +package us.codecraft.webmagic; + +import us.codecraft.webmagic.downloader.HttpClientDownloader; +import us.codecraft.webmagic.model.PageMapper; +import us.codecraft.webmagic.proxy.ProxyProvider; + +/** + * @author code4crafter@gmail.com + * Date: 2017/5/27 + * @since 0.7.0 + */ +public class SimpleHttpClient { + + private final HttpClientDownloader httpClientDownloader; + + private final Site site; + + public SimpleHttpClient() { + this(Site.me()); + } + + public SimpleHttpClient(Site site) { + this.site = site; + this.httpClientDownloader = new HttpClientDownloader(); + } + + public void setProxyProvider(ProxyProvider proxyProvider){ + this.httpClientDownloader.setProxyProvider(proxyProvider); + } + + public T get(String url, Class clazz) { + return get(new Request(url), clazz); + } + + public T get(Request request, Class clazz) { + Page page = httpClientDownloader.download(request, site.toTask()); + if (!page.isDownloadSuccess()) { + return null; + } + return new PageMapper(clazz).get(page); + } + + public Page get(String url) { + return httpClientDownloader.download(new Request(url), site.toTask()); + } + + public Page get(Request request) { + return httpClientDownloader.download(request, site.toTask()); + } + +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java index fd0cc479bbab1760b6e68185530327149744b698..6055bdb0f87271b1edf9cddbbb74967d16e3b48a 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java @@ -1,6 +1,5 @@ package us.codecraft.webmagic.downloader; -import org.apache.http.annotation.ThreadSafe; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import us.codecraft.webmagic.Page; @@ -16,7 +15,6 @@ import java.io.*; * @author dolphineor@gmail.com * @version 0.5.3 */ -@ThreadSafe public class PhantomJSDownloader extends AbstractDownloader { private static Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class); @@ -38,7 +36,7 @@ public class PhantomJSDownloader extends AbstractDownloader { * phantomjs --ignore-ssl-errors=yes 忽略抓取地址是https时的一些错误 * /usr/local/bin/phantomjs 命令的绝对路径,避免因系统环境变量引起的IOException * - * @param phantomJsCommand + * @param phantomJsCommand phantomJsCommand */ public PhantomJSDownloader(String phantomJsCommand) { this.initPhantomjsCrawlPath(); diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepo.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepo.java index 738d4a7d5ca88c2bda58dceabb50102b1f585a90..844c775f4e5001ff9b9a1b364fb9b4a7cadfa310 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepo.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepo.java @@ -19,7 +19,7 @@ import java.util.List; @HelpUrl({"https://github.com/\\w+\\?tab=repositories", "https://github.com/\\w+", "https://github.com/explore/*"}) public class GithubRepo implements HasKey { - @ExtractBy(value = "//h1[@class='entry-title public']/strong/a/text()", notNull = true) + @ExtractBy(value = "//h1[@class='public']/strong/a/text()", notNull = true) private String name; @ExtractByUrl("https://github\\.com/(\\w+)/.*") diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java index 6bfe88d7dcfeec1bc5a6318217e80f465f2feac3..1c1ed6e8200ae8273025b646a79179d6f5038d7e 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java @@ -23,6 +23,8 @@ class ModelPageProcessor implements PageProcessor { private Site site; + private boolean extractLinks = true; + public static ModelPageProcessor create(Site site, Class... clazzs) { ModelPageProcessor modelPageProcessor = new ModelPageProcessor(site); for (Class clazz : clazzs) { @@ -45,8 +47,10 @@ class ModelPageProcessor implements PageProcessor { @Override public void process(Page page) { for (PageModelExtractor pageModelExtractor : pageModelExtractorList) { - extractLinks(page, pageModelExtractor.getHelpUrlRegionSelector(), pageModelExtractor.getHelpUrlPatterns()); - extractLinks(page, pageModelExtractor.getTargetUrlRegionSelector(), pageModelExtractor.getTargetUrlPatterns()); + if (extractLinks) { + extractLinks(page, pageModelExtractor.getHelpUrlRegionSelector(), pageModelExtractor.getHelpUrlPatterns()); + extractLinks(page, pageModelExtractor.getTargetUrlRegionSelector(), pageModelExtractor.getTargetUrlPatterns()); + } Object process = pageModelExtractor.process(page); if (process == null || (process instanceof List && ((List) process).size() == 0)) { continue; @@ -70,7 +74,7 @@ class ModelPageProcessor implements PageProcessor { for (Pattern targetUrlPattern : urlPatterns) { Matcher matcher = targetUrlPattern.matcher(link); if (matcher.find()) { - page.addTargetRequest(new Request(matcher.group(1))); + page.addTargetRequest(new Request(matcher.group(0))); } } } @@ -83,4 +87,12 @@ class ModelPageProcessor implements PageProcessor { public Site getSite() { return site; } + + public boolean isExtractLinks() { + return extractLinks; + } + + public void setExtractLinks(boolean extractLinks) { + this.extractLinks = extractLinks; + } } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java index 08dc64a10285e03a50eaaaec3b2a7f3c967a5c66..eaabcca2ce94635098c09239f9976c032a500744 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java @@ -97,4 +97,9 @@ public class OOSpider extends Spider { return this; } + public OOSpider setIsExtractLinks(boolean isExtractLinks){ + modelPageProcessor.setExtractLinks(isExtractLinks); + return this; + } + } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java index a1da94bde02305772e519c3e4c638a7308469f41..1e25a46c06e9dd3d237e070bab3f1b85dc6c4555 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java @@ -5,9 +5,8 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.model.annotation.*; -import us.codecraft.webmagic.model.formatter.BasicTypeFormatter; import us.codecraft.webmagic.model.formatter.ObjectFormatter; -import us.codecraft.webmagic.model.formatter.ObjectFormatters; +import us.codecraft.webmagic.model.formatter.ObjectFormatterBuilder; import us.codecraft.webmagic.selector.*; import us.codecraft.webmagic.utils.ClassUtils; import us.codecraft.webmagic.utils.ExtractorUtils; @@ -20,6 +19,8 @@ import java.util.ArrayList; import java.util.List; import java.util.regex.Pattern; +import static us.codecraft.webmagic.model.annotation.ExtractBy.Source.RawText; + /** * The main internal logic of page model extractor. * @@ -70,65 +71,12 @@ class PageModelExtractor { fieldExtractor = fieldExtractorTmp; } if (fieldExtractor != null) { - checkFormat(field, fieldExtractor); + fieldExtractor.setObjectFormatter(new ObjectFormatterBuilder().setField(field).build()); fieldExtractors.add(fieldExtractor); } } } - private void checkFormat(Field field, FieldExtractor fieldExtractor) { - //check custom formatter - Formatter formatter = field.getAnnotation(Formatter.class); - if (formatter != null && !formatter.formatter().equals(ObjectFormatter.class)) { - if (formatter != null) { - if (!formatter.formatter().equals(ObjectFormatter.class)) { - ObjectFormatter objectFormatter = initFormatter(formatter.formatter()); - objectFormatter.initParam(formatter.value()); - fieldExtractor.setObjectFormatter(objectFormatter); - return; - } - } - } - if (!fieldExtractor.isMulti() && !String.class.isAssignableFrom(field.getType())) { - Class fieldClazz = BasicTypeFormatter.detectBasicClass(field.getType()); - ObjectFormatter objectFormatter = getObjectFormatter(field, fieldClazz, formatter); - if (objectFormatter == null) { - throw new IllegalStateException("Can't find formatter for field " + field.getName() + " of type " + fieldClazz); - } else { - fieldExtractor.setObjectFormatter(objectFormatter); - } - } else if (fieldExtractor.isMulti()) { - if (!List.class.isAssignableFrom(field.getType())) { - throw new IllegalStateException("Field " + field.getName() + " must be list"); - } - if (formatter != null) { - if (!formatter.subClazz().equals(Void.class)) { - ObjectFormatter objectFormatter = getObjectFormatter(field, formatter.subClazz(), formatter); - if (objectFormatter == null) { - throw new IllegalStateException("Can't find formatter for field " + field.getName() + " of type " + formatter.subClazz()); - } else { - fieldExtractor.setObjectFormatter(objectFormatter); - } - } - } - } - } - - private ObjectFormatter getObjectFormatter(Field field, Class fieldClazz, Formatter formatter) { - return initFormatter(ObjectFormatters.get(fieldClazz)); - } - - private ObjectFormatter initFormatter(Class formatterClazz) { - try { - return formatterClazz.newInstance(); - } catch (InstantiationException e) { - logger.error("init ObjectFormatter fail", e); - } catch (IllegalAccessException e) { - logger.error("init ObjectFormatter fail", e); - } - return null; - } - private FieldExtractor getAnnotationExtractByUrl(Class clazz, Field field) { FieldExtractor fieldExtractor = null; ExtractByUrl extractByUrl = field.getAnnotation(ExtractByUrl.class); @@ -179,10 +127,13 @@ class PageModelExtractor { ExtractBy extractBy = field.getAnnotation(ExtractBy.class); if (extractBy != null) { Selector selector = ExtractorUtils.getSelector(extractBy); - + ExtractBy.Source source0 = extractBy.source(); + if (extractBy.type()== ExtractBy.Type.JsonPath){ + source0 = RawText; + } FieldExtractor.Source source = null; - switch (extractBy.source()){ - case RawText: + switch (source0){ + case RawText: source = FieldExtractor.Source.RawText; break; case RawHtml: @@ -197,11 +148,8 @@ class PageModelExtractor { } fieldExtractor = new FieldExtractor(field, selector, source, - extractBy.notNull(), extractBy.multi() || List.class.isAssignableFrom(field.getType())); - Method setterMethod = getSetterMethod(clazz, field); - if (setterMethod != null) { - fieldExtractor.setSetterMethod(setterMethod); - } + extractBy.notNull(), List.class.isAssignableFrom(field.getType())); + fieldExtractor.setSetterMethod(getSetterMethod(clazz, field)); } return fieldExtractor; } @@ -220,12 +168,12 @@ class PageModelExtractor { private void initClassExtractors() { Annotation annotation = clazz.getAnnotation(TargetUrl.class); if (annotation == null) { - targetUrlPatterns.add(Pattern.compile("(.*)")); + targetUrlPatterns.add(Pattern.compile(".*")); } else { TargetUrl targetUrl = (TargetUrl) annotation; String[] value = targetUrl.value(); for (String s : value) { - targetUrlPatterns.add(Pattern.compile("(" + s.replace(".", "\\.").replace("*", "[^\"'#]*") + ")")); + targetUrlPatterns.add(Pattern.compile(s.replace(".", "\\.").replace("*", "[^\"'#]*"))); } if (!targetUrl.sourceRegion().equals("")) { targetUrlRegionSelector = new XpathSelector(targetUrl.sourceRegion()); @@ -236,7 +184,7 @@ class PageModelExtractor { HelpUrl helpUrl = (HelpUrl) annotation; String[] value = helpUrl.value(); for (String s : value) { - helpUrlPatterns.add(Pattern.compile("(" + s.replace(".", "\\.").replace("*", "[^\"'#]*") + ")")); + helpUrlPatterns.add(Pattern.compile(s.replace(".", "\\.").replace("*", "[^\"'#]*"))); } if (!helpUrl.sourceRegion().equals("")) { helpUrlRegionSelector = new XpathSelector(helpUrl.sourceRegion()); diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/Formatter.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/Formatter.java index f7edffd5ae41f6eeeae226cf69ebb3a42ebda15c..eb7ecb488711fdc2eb8daa1d97aa33ce405268ef 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/Formatter.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/Formatter.java @@ -16,6 +16,8 @@ import java.lang.annotation.Target; @Target({ElementType.FIELD}) public @interface Formatter { + Class DEFAULT_FORMATTER = ObjectFormatter.class; + /** * Set formatter params. * diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/ObjectFormatterBuilder.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/ObjectFormatterBuilder.java new file mode 100644 index 0000000000000000000000000000000000000000..4c32dfc62e2bd107d07ccd7c2f177350c7ca8f5b --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/ObjectFormatterBuilder.java @@ -0,0 +1,56 @@ +package us.codecraft.webmagic.model.formatter; + +import us.codecraft.webmagic.model.annotation.Formatter; + +import java.lang.reflect.Field; +import java.util.List; + +/** + * @author code4crafter@gmail.com + * @since 0.7.0 + * Date: 2017/6/3 + */ +public class ObjectFormatterBuilder { + + private Field field; + + public ObjectFormatterBuilder setField(Field field) { + this.field = field; + return this; + } + + private ObjectFormatter initFormatterForType(Class fieldClazz, String[] params) { + if (fieldClazz.equals(String.class) || List.class.isAssignableFrom(fieldClazz)){ + return null; + } + Class formatterClass = ObjectFormatters.get(BasicTypeFormatter.detectBasicClass(fieldClazz)); + if (formatterClass == null) { + throw new IllegalStateException("Can't find formatter for field " + field.getName() + " of type " + fieldClazz); + } + return initFormatter(formatterClass, params); + } + + private ObjectFormatter initFormatter(Class formatterClazz, String[] params) { + try { + ObjectFormatter objectFormatter = formatterClazz.newInstance(); + objectFormatter.initParam(params); + return objectFormatter; + } catch (InstantiationException e) { + throw new RuntimeException(e); + } catch (IllegalAccessException e) { + throw new RuntimeException(e); + } + } + + public ObjectFormatter build() { + Formatter formatter = field.getAnnotation(Formatter.class); + if (formatter != null && !formatter.formatter().equals(Formatter.DEFAULT_FORMATTER)) { + return initFormatter(formatter.formatter(), formatter.value()); + } + if (formatter == null || formatter.subClazz().equals(Void.class)) { + return initFormatterForType(field.getType(), formatter != null ? formatter.value() : null); + } else { + return initFormatterForType(formatter.subClazz(), formatter.value()); + } + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/ObjectFormatters.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/ObjectFormatters.java index 7534e5ea899a031847b85edcce4493c5cd176e71..42747e718cbcb227dcb52734be409d55fd8f40be 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/ObjectFormatters.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/ObjectFormatters.java @@ -22,9 +22,9 @@ public class ObjectFormatters { try { formatterMap.put(objectFormatter.newInstance().clazz(), objectFormatter); } catch (InstantiationException e) { - e.printStackTrace(); + throw new RuntimeException(e); } catch (IllegalAccessException e) { - e.printStackTrace(); + throw new RuntimeException(e); } } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java index 59f4b3f4338024f5a484a72dd3fdf3479d509663..c70d88507dc2e0849505496fdda578dc6fe2201f 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java @@ -2,6 +2,7 @@ package us.codecraft.webmagic.scheduler; import com.alibaba.fastjson.JSON; import org.apache.commons.codec.digest.DigestUtils; +import org.apache.commons.lang3.StringUtils; import redis.clients.jedis.Jedis; import redis.clients.jedis.JedisPool; import redis.clients.jedis.JedisPoolConfig; @@ -48,7 +49,7 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor public boolean isDuplicate(Request request, Task task) { Jedis jedis = pool.getResource(); try { - return jedis.sadd(getSetKey(task), request.getUrl()) > 0; + return jedis.sadd(getSetKey(task), request.getUrl()) == 0; } finally { pool.returnResource(jedis); } @@ -60,14 +61,41 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor Jedis jedis = pool.getResource(); try { jedis.rpush(getQueueKey(task), request.getUrl()); - if (request.getExtras() != null) { + if (checkForAdditionalInfo(request)) { String field = DigestUtils.shaHex(request.getUrl()); String value = JSON.toJSONString(request); jedis.hset((ITEM_PREFIX + task.getUUID()), field, value); } } finally { - pool.returnResource(jedis); + jedis.close(); + } + } + + private boolean checkForAdditionalInfo(Request request) { + if (request == null) { + return false; + } + + if (!request.getHeaders().isEmpty() || !request.getCookies().isEmpty()) { + return true; + } + + if (StringUtils.isNotBlank(request.getCharset()) || StringUtils.isNotBlank(request.getMethod())) { + return true; + } + + if (request.isBinaryContent() || request.getRequestBody() != null) { + return true; } + + if (request.getExtras() != null && !request.getExtras().isEmpty()) { + return true; + } + if (request.getPriority() != 0L) { + return true; + } + + return false; } @Override @@ -85,7 +113,7 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor Request o = JSON.parseObject(new String(bytes), Request.class); return o; } - Request request = new Request(url); + Request request = new Request(url); return request; } finally { pool.returnResource(jedis); @@ -100,8 +128,7 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor return QUEUE_PREFIX + task.getUUID(); } - protected String getItemKey(Task task) - { + protected String getItemKey(Task task) { return ITEM_PREFIX + task.getUUID(); } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ExtractorUtils.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ExtractorUtils.java index 54a4439d37aa24d4779ba3c1a5401a37bbf39c2e..d3fc42313ee6cf612646bc9fe02ee320acb41a65 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ExtractorUtils.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ExtractorUtils.java @@ -25,22 +25,17 @@ public class ExtractorUtils { selector = new RegexSelector(value); break; case XPath: - selector = getXpathSelector(value); + selector = new XpathSelector(value); break; case JsonPath: selector = new JsonPathSelector(value); break; default: - selector = getXpathSelector(value); + selector = new XpathSelector(value); } return selector; } - private static Selector getXpathSelector(String value) { - Selector selector = new XpathSelector(value); - return selector; - } - public static List getSelectors(ExtractBy[] extractBies) { List selectors = new ArrayList(); if (extractBies == null) { diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/RequestUtils.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/RequestUtils.java new file mode 100644 index 0000000000000000000000000000000000000000..135dc40d72449d92361cb6cd873a7ae0cfecdb2e --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/RequestUtils.java @@ -0,0 +1,37 @@ +package us.codecraft.webmagic.utils; + +import us.codecraft.webmagic.Request; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * @author code4crafter@gmail.com + * Date: 2017/6/5 + * Time: 下午4:58 + */ +public abstract class RequestUtils { + + private static Pattern p4Range = Pattern.compile("\\[(\\d+)\\-(\\d+)\\]"); + + public static List from(String exp){ + Matcher matcher = p4Range.matcher(exp); + if (!matcher.find()) { + return Collections.singletonList(new Request(exp)); + } + int rangeFrom = Integer.parseInt(matcher.group(1)); + int rangeTo = Integer.parseInt(matcher.group(2)); + if (rangeFrom > rangeTo) { + return Collections.emptyList(); + } + List requests = new ArrayList(rangeTo - rangeFrom + 1); + for (int i = rangeFrom; i <= rangeTo; i++) { + requests.add(new Request(matcher.replaceAll(String.valueOf(i)))); + } + return requests; + } + +} diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/SimpleHttpClientTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/SimpleHttpClientTest.java new file mode 100644 index 0000000000000000000000000000000000000000..41a33cd178e4aa295cf438cb25d8720e2f9e5145 --- /dev/null +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/SimpleHttpClientTest.java @@ -0,0 +1,89 @@ +package us.codecraft.webmagic; + +import org.junit.Ignore; +import org.junit.Test; +import us.codecraft.webmagic.model.AfterExtractor; +import us.codecraft.webmagic.model.annotation.ExtractBy; + +import static org.assertj.core.api.Assertions.assertThat; + +/** + * @author code4crafter@gmail.com + * Date: 2017/6/3 + * Time: 下午2:54 + */ +public class SimpleHttpClientTest { + + public static class Weather implements AfterExtractor { + + private String location; + + @ExtractBy(notNull = true, value = "//div[@id='7d']//ul[@class='t']/li[2]/p[@class='tem']/i/regex('([\\-\\d]+)',1)") + private Integer lowTemperature; + + @ExtractBy(notNull = true, value = "//div[@id='7d']//ul[@class='t']/li[2]/p[@class='tem']/span/regex('([\\-\\d]+)',1)") + private Integer highTemperature; + + @ExtractBy(notNull = true, value = "//div[@id='7d']//ul[@class='t']/li[2]/p[@class='wea']/text()") + private String desc; + + @Override + public void afterProcess(Page page) { + if (lowTemperature > highTemperature) { + int temp = lowTemperature; + lowTemperature = highTemperature; + highTemperature = temp; + } + } + + public String getLocation() { + return location; + } + + public void setLocation(String location) { + this.location = location; + } + + public Integer getLowTemperature() { + return lowTemperature; + } + + public void setLowTemperature(Integer lowTemperature) { + this.lowTemperature = lowTemperature; + } + + public Integer getHighTemperature() { + return highTemperature; + } + + public void setHighTemperature(Integer highTemperature) { + this.highTemperature = highTemperature; + } + + public String getDesc() { + return desc; + } + + public void setDesc(String desc) { + this.desc = desc; + } + + @Override + public String toString() { + return "Weather{" + + "location='" + location + '\'' + + ", lowTemperature=" + lowTemperature + + ", highTemperature=" + highTemperature + + ", desc='" + desc + '\'' + + '}'; + } + } + + @Ignore + @Test + public void test() throws Exception { + Weather weather = new SimpleHttpClient(Site.me()).get("http://www.weather.com.cn/weather/101020100.shtml", Weather.class); + assertThat(weather).isNotNull(); + } + +} diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java index 6baee728a9846de76d4e1fe41455e2fdd6da52e1..91e3698cf4ced155d441e85b3fdd19fd005f080a 100644 --- a/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java @@ -3,7 +3,6 @@ package us.codecraft.webmagic.downloader; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; -import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.selector.PlainText; /** @@ -937,7 +936,8 @@ public class MockGithubDownloader implements Downloader{ @Override public Page download(Request request, Task task) { Page page = new Page(); - page.setHtml(new Html(html)); + page.setRawText(html); + page.setStatusCode(200); page.setRequest(new Request("https://github.com/code4craft/webmagic")); page.setUrl(new PlainText("https://github.com/code4craft/webmagic")); return page; diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/formatter/DateFormatterTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/formatter/DateFormatterTest.java index a621e2dcb1d420bca249472ac0db23bc123f8f23..fa276cbbf3e812613f3bf969549cc5d17c363b0e 100644 --- a/webmagic-extension/src/test/java/us/codecraft/webmagic/formatter/DateFormatterTest.java +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/formatter/DateFormatterTest.java @@ -1,10 +1,14 @@ package us.codecraft.webmagic.formatter; +import org.apache.commons.lang3.time.DateFormatUtils; +import org.apache.commons.lang3.time.DateUtils; import org.junit.Test; import us.codecraft.webmagic.model.formatter.DateFormatter; import java.util.Date; +import static org.assertj.core.api.Assertions.assertThat; + /** * @author code4crafter@gmail.com */ @@ -13,8 +17,10 @@ public class DateFormatterTest { @Test public void testDateFormatter() throws Exception { DateFormatter dateFormatter = new DateFormatter(); - dateFormatter.initParam(new String[]{"yyyy-MM-dd HH:mm"}); - Date format = dateFormatter.format("2013-09-10 22:11"); - System.out.println(format); + String pattern = "yyyy-MM-dd HH:mm"; + Date date = DateUtils.parseDate("2013-09-10 22:11", new String[]{pattern}); + dateFormatter.initParam(new String[]{pattern}); + Date format = dateFormatter.format(DateFormatUtils.format(date, pattern)); + assertThat(format).isEqualTo(date); } } diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepoApi.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepoApi.java new file mode 100644 index 0000000000000000000000000000000000000000..37506451e69de6c5664b04041229f713ea236a7c --- /dev/null +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepoApi.java @@ -0,0 +1,18 @@ +package us.codecraft.webmagic.model; + +import us.codecraft.webmagic.model.annotation.ExtractBy; + +/** + * @author code4crafter@gmail.com + * Date: 2017/6/3 + * Time: 下午9:07 + */ +public class GithubRepoApi { + + @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$.name",source = ExtractBy.Source.RawText) + private String name; + + public String getName() { + return name; + } +} diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepoTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepoTest.java index 1e9fd52571cf58e4f7c5aff7133fecf9b84455b0..632dd8697a7250cb44bf8f7f497d68f42738cd07 100644 --- a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepoTest.java +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepoTest.java @@ -4,6 +4,7 @@ import org.junit.Test; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.downloader.MockGithubDownloader; +import us.codecraft.webmagic.example.GithubRepo; import us.codecraft.webmagic.pipeline.PageModelPipeline; import static org.assertj.core.api.Assertions.assertThat; @@ -24,4 +25,5 @@ public class GithubRepoTest { } }, GithubRepo.class).addUrl("https://github.com/code4craft/webmagic").setDownloader(new MockGithubDownloader()).test("https://github.com/code4craft/webmagic"); } + } diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/MockModel.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/MockModel.java deleted file mode 100644 index 653105347f20c86cc32a2c33a268b8f4bab1c8e3..0000000000000000000000000000000000000000 --- a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/MockModel.java +++ /dev/null @@ -1,13 +0,0 @@ -package us.codecraft.webmagic.model; - -import us.codecraft.webmagic.model.annotation.HelpUrl; -import us.codecraft.webmagic.model.annotation.TargetUrl; - -/** - * @author code4crafer@gmail.com - */ -@TargetUrl(value = "http://webmagic.io/post/\\d+",sourceRegion = "//li[@class='post']") -@HelpUrl(value = "http://webmagic.io/list/\\d+",sourceRegion = "//li[@class='list']") -public class MockModel { - -} diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/ModelPageProcessorTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/ModelPageProcessorTest.java index 7733d4c68dc2034c85754854e12299628512257c..627fa6e84be903702e392de93fbecc45b8b5131d 100644 --- a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/ModelPageProcessorTest.java +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/ModelPageProcessorTest.java @@ -1,15 +1,13 @@ package us.codecraft.webmagic.model; -import org.apache.commons.io.IOUtils; import org.junit.Test; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.model.annotation.ExtractBy; +import us.codecraft.webmagic.model.annotation.HelpUrl; import us.codecraft.webmagic.model.annotation.TargetUrl; import us.codecraft.webmagic.selector.PlainText; -import java.io.IOException; - import static org.assertj.core.api.Assertions.assertThat; /** @@ -18,6 +16,8 @@ import static org.assertj.core.api.Assertions.assertThat; */ public class ModelPageProcessorTest { + private PageMocker pageMocker = new PageMocker(); + @TargetUrl("http://codecraft.us/foo") public static class ModelFoo { @@ -34,6 +34,12 @@ public class ModelPageProcessorTest { } + @TargetUrl(value = "http://webmagic.io/foo/\\d+",sourceRegion = "//li[@class='bar']") + @HelpUrl(value = "http://webmagic.io/bar/\\d+",sourceRegion = "//li[@class='foo']") + public static class MockModel { + + } + @Test public void testMultiModel_should_not_skip_when_match() throws Exception { Page page = new Page(); @@ -48,17 +54,19 @@ public class ModelPageProcessorTest { @Test public void testExtractLinks() throws Exception { ModelPageProcessor modelPageProcessor = ModelPageProcessor.create(null, MockModel.class); - Page page = getMockPage(); + Page page = pageMocker.getMockPage(); modelPageProcessor.process(page); - assertThat(page.getTargetRequests()).containsExactly(new Request("http://webmagic.io/list/1"), new Request("http://webmagic.io/list/2"), new Request("http://webmagic.io/post/1"), new Request("http://webmagic.io/post/2")); - + assertThat(page.getTargetRequests()).containsExactly(new Request("http://webmagic.io/bar/3"), new Request("http://webmagic.io/bar/4"), new Request("http://webmagic.io/foo/3"), new Request("http://webmagic.io/foo/4")); } - private Page getMockPage() throws IOException { - Page page = new Page(); - page.setRawText(IOUtils.toString(getClass().getClassLoader().getResourceAsStream("html/mock-webmagic.html"))); - page.setRequest(new Request("http://webmagic.io/list/0")); - page.setUrl(new PlainText("http://webmagic.io/list/0")); - return page; + @Test + public void testExtractNoLinks() throws Exception { + ModelPageProcessor modelPageProcessor = ModelPageProcessor.create(null, MockModel.class); + Page page = pageMocker.getMockPage(); + modelPageProcessor.setExtractLinks(false); + modelPageProcessor.process(page); + assertThat(page.getTargetRequests()).isEmpty(); } + + } diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageMapperTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageMapperTest.java new file mode 100644 index 0000000000000000000000000000000000000000..45938d6203d857a7c7333e867bcc082ab3d1e751 --- /dev/null +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageMapperTest.java @@ -0,0 +1,23 @@ +package us.codecraft.webmagic.model; + +import org.junit.Test; + +import static org.assertj.core.api.Assertions.assertThat; + +/** + * @author code4crafter@gmail.com + * Date: 2017/6/3 + * Time: 下午3:23 + */ +public class PageMapperTest { + + private PageMocker pageMocker = new PageMocker(); + + @Test + public void test_get() throws Exception { + PageMapper pageMapper = new PageMapper(GithubRepoApi.class); + GithubRepoApi githubRepo = pageMapper.get(pageMocker.getMockJsonPage()); + assertThat(githubRepo.getName()).isEqualTo("webmagic"); + } + +} diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageMocker.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageMocker.java new file mode 100644 index 0000000000000000000000000000000000000000..4b0c133cbd06c7af1144cac5080b620286f4aacb --- /dev/null +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageMocker.java @@ -0,0 +1,32 @@ +package us.codecraft.webmagic.model; + +import org.apache.commons.io.IOUtils; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.selector.PlainText; + +import java.io.IOException; + +/** + * @author code4crafter@gmail.com + * Date: 2017/6/3 + * Time: 下午9:08 + */ +public class PageMocker { + + public Page getMockJsonPage() throws IOException { + Page page = new Page(); + page.setRawText(IOUtils.toString(PageMocker.class.getClassLoader().getResourceAsStream("json/mock-githubrepo.json"))); + page.setRequest(new Request("https://api.github.com/repos/code4craft/webmagic")); + page.setUrl(new PlainText("https://api.github.com/repos/code4craft/webmagic")); + return page; + } + + public Page getMockPage() throws IOException { + Page page = new Page(); + page.setRawText(IOUtils.toString(PageMocker.class.getClassLoader().getResourceAsStream("html/mock-webmagic.html"))); + page.setRequest(new Request("http://webmagic.io/list/0")); + page.setUrl(new PlainText("http://webmagic.io/list/0")); + return page; + } +} diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageModelExtractorTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageModelExtractorTest.java new file mode 100644 index 0000000000000000000000000000000000000000..f212628b42a8b25a48ce3ee451a18e79d0acc465 --- /dev/null +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageModelExtractorTest.java @@ -0,0 +1,145 @@ +package us.codecraft.webmagic.model; + +import org.apache.commons.lang3.time.DateFormatUtils; +import org.apache.commons.lang3.time.DateUtils; +import org.junit.Test; +import us.codecraft.webmagic.model.annotation.ExtractBy; +import us.codecraft.webmagic.model.annotation.ExtractByUrl; +import us.codecraft.webmagic.model.annotation.Formatter; +import us.codecraft.webmagic.model.formatter.DateFormatter; + +import java.util.Date; +import java.util.List; + +import static org.assertj.core.api.Assertions.assertThat; + +/** + * @author code4crafter@gmail.com + * Date: 2017/6/3 + * Time: 下午9:06 + */ +public class PageModelExtractorTest { + + private PageMocker pageMocker = new PageMocker(); + + public static class ModelDateStr { + + @ExtractBy(value = "//div[@class='date']/text()", notNull = true) + private String dateStr; + + } + + public static class ModelDate { + + @Formatter(value = "yyyyMMdd", formatter = DateFormatter.class) + @ExtractBy(value = "//div[@class='date']/text()", notNull = true) + private Date date; + + } + + public static class ModelInt { + + @ExtractBy(value = "//div[@class='number']/text()", notNull = true) + private int number; + + } + + public static class ModelStringList { + + @ExtractBy("//li[@class='list']/a/@href") + private List links; + + } + + public static class ModelIntList { + + @Formatter(subClazz = Integer.class) + @ExtractBy("//li[@class='numbers']/text()") + private List numbers; + + } + + public static class ModelDateList { + + @Formatter(subClazz = Date.class, value = "yyyyMMdd") + @ExtractBy("//li[@class='dates']/text()") + private List dates; + + } + + public static class ModelCustomList { + + @Formatter(subClazz = Date.class, value = "yyyyMMdd",formatter = DateFormatter.class) + @ExtractBy("//li[@class='dates']/text()") + private List dates; + + } + + public static class ModelJsonStr { + + @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$.name") + private String name; + + } + + public static class ModelUrl { + + @ExtractByUrl("https://api\\.github\\.com/repos/\\w+/(\\w+)") + private String name; + + } + + @Test + public void testXpath() throws Exception { + ModelDateStr modelDate = (ModelDateStr) PageModelExtractor.create(ModelDateStr.class).process(pageMocker.getMockPage()); + assertThat(modelDate.dateStr).isEqualTo("20170603"); + } + + @Test + public void testExtractDate() throws Exception { + ModelDate modelDate = (ModelDate) PageModelExtractor.create(ModelDate.class).process(pageMocker.getMockPage()); + assertThat(DateFormatUtils.format(modelDate.date,"yyyyMMdd")).isEqualTo("20170603"); + } + + @Test + public void testExtractInt() throws Exception { + ModelInt modelDate = (ModelInt) PageModelExtractor.create(ModelInt.class).process(pageMocker.getMockPage()); + assertThat(modelDate.number).isEqualTo(12); + } + + @Test + public void testExtractList() throws Exception { + ModelStringList modelDate = (ModelStringList) PageModelExtractor.create(ModelStringList.class).process(pageMocker.getMockPage()); + assertThat(modelDate.links).containsExactly("http://webmagic.io/list/1","http://webmagic.io/list/2","http://webmagic.io/list/3","http://webmagic.io/list/4"); + } + + @Test + public void testExtractIntList() throws Exception { + ModelIntList modelDate = (ModelIntList) PageModelExtractor.create(ModelIntList.class).process(pageMocker.getMockPage()); + assertThat(modelDate.numbers).containsExactly(1,2,3,4); + } + + @Test + public void testExtractDateList() throws Exception { + ModelDateList modelDate = (ModelDateList) PageModelExtractor.create(ModelDateList.class).process(pageMocker.getMockPage()); + assertThat(modelDate.dates).containsExactly(DateUtils.parseDate("20170601", "yyyyMMdd"), DateUtils.parseDate("20170602", "yyyyMMdd"), DateUtils.parseDate("20170603", "yyyyMMdd"), DateUtils.parseDate("20170604", "yyyyMMdd")); + } + + @Test + public void testExtractCustomList() throws Exception { + ModelCustomList modelDate = (ModelCustomList) PageModelExtractor.create(ModelCustomList.class).process(pageMocker.getMockPage()); + assertThat(modelDate.dates).containsExactly(DateUtils.parseDate("20170601", "yyyyMMdd"), DateUtils.parseDate("20170602", "yyyyMMdd"), DateUtils.parseDate("20170603", "yyyyMMdd"), DateUtils.parseDate("20170604", "yyyyMMdd")); + } + + @Test + public void testExtractJson() throws Exception { + ModelJsonStr modelDate = (ModelJsonStr) PageModelExtractor.create(ModelJsonStr.class).process(pageMocker.getMockJsonPage()); + assertThat(modelDate.name).isEqualTo("webmagic"); + } + + @Test + public void testExtractByUrl() throws Exception { + ModelUrl modelDate = (ModelUrl) PageModelExtractor.create(ModelUrl.class).process(pageMocker.getMockJsonPage()); + assertThat(modelDate.name).isEqualTo("webmagic"); + } +} diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java index 1518763597808ee658554d399ed4da324e5a51da..b4124d2d9d5854de12bfe6dac6723ab2b65ed1cb 100644 --- a/webmagic-extension/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java @@ -7,6 +7,8 @@ import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; +import static org.assertj.core.api.Assertions.assertThat; + /** * @author code4crafter@gmail.com
*/ @@ -37,7 +39,7 @@ public class RedisSchedulerTest { request.putExtra("1","2"); redisScheduler.push(request, task); Request poll = redisScheduler.poll(task); - System.out.println(poll); + assertThat(poll).isEqualTo(request); } } diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/utils/RequestUtilsTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/utils/RequestUtilsTest.java new file mode 100644 index 0000000000000000000000000000000000000000..ec8486483cd6e4319e0df4db748dfdf91704ce12 --- /dev/null +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/utils/RequestUtilsTest.java @@ -0,0 +1,28 @@ +package us.codecraft.webmagic.utils; + +import org.junit.Test; +import us.codecraft.webmagic.Request; + +import java.util.List; + +import static org.assertj.core.api.Assertions.assertThat; + +/** + * @author code4crafter@gmail.com + * Date: 2017/6/5 + * Time: 下午5:08 + */ +public class RequestUtilsTest { + + @Test + public void test_generate_range() throws Exception { + List requests = RequestUtils.from("http://angularjs.cn/api/article/latest?p=[1-3]&s=20"); + assertThat(requests).containsExactly(new Request("http://angularjs.cn/api/article/latest?p=1&s=20"), new Request("http://angularjs.cn/api/article/latest?p=2&s=20"), new Request("http://angularjs.cn/api/article/latest?p=3&s=20")); + } + + @Test + public void test_generate_range_when_invalid_number() throws Exception { + List requests = RequestUtils.from("http://angularjs.cn/api/article/latest?p=[10-3]&s=20"); + assertThat(requests).isEmpty(); + } +} diff --git a/webmagic-extension/src/test/resources/html/mock-webmagic.html b/webmagic-extension/src/test/resources/html/mock-webmagic.html index 436e1e0648d7fcba2e4570f99de5995a5af206a9..351ec97624ceb377e3c940f7bb09ad561cd62f67 100644 --- a/webmagic-extension/src/test/resources/html/mock-webmagic.html +++ b/webmagic-extension/src/test/resources/html/mock-webmagic.html @@ -5,18 +5,44 @@ +
20170603
+
12
  • -
  • -
  • +
  • +
  • -
  • -
  • +
  • +
  • +
+
    +
  • +
  • +
  • +
  • +
+
    +
  • +
  • +
  • +
+
    +
  • 1
  • +
  • 2
  • +
  • 3
  • +
  • 4
  • +
+
    +
  • 20170601
  • +
  • 20170602
  • +
  • 20170603
  • +
  • 20170604
  • +
\ No newline at end of file diff --git a/webmagic-extension/src/test/resources/json/mock-githubrepo.json b/webmagic-extension/src/test/resources/json/mock-githubrepo.json new file mode 100644 index 0000000000000000000000000000000000000000..a5037d3d139f1789058960de736f253abeefbd09 --- /dev/null +++ b/webmagic-extension/src/test/resources/json/mock-githubrepo.json @@ -0,0 +1,91 @@ +{ + "id": 9623064, + "name": "webmagic", + "full_name": "code4craft/webmagic", + "owner": { + "login": "code4craft", + "id": 1351884, + "avatar_url": "https://avatars0.githubusercontent.com/u/1351884?v=3", + "gravatar_id": "", + "url": "https://api.github.com/users/code4craft", + "html_url": "https://github.com/code4craft", + "followers_url": "https://api.github.com/users/code4craft/followers", + "following_url": "https://api.github.com/users/code4craft/following{/other_user}", + "gists_url": "https://api.github.com/users/code4craft/gists{/gist_id}", + "starred_url": "https://api.github.com/users/code4craft/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/code4craft/subscriptions", + "organizations_url": "https://api.github.com/users/code4craft/orgs", + "repos_url": "https://api.github.com/users/code4craft/repos", + "events_url": "https://api.github.com/users/code4craft/events{/privacy}", + "received_events_url": "https://api.github.com/users/code4craft/received_events", + "type": "User", + "site_admin": false + }, + "private": false, + "html_url": "https://github.com/code4craft/webmagic", + "description": "A scalable web crawler framework for Java.", + "fork": false, + "url": "https://api.github.com/repos/code4craft/webmagic", + "forks_url": "https://api.github.com/repos/code4craft/webmagic/forks", + "keys_url": "https://api.github.com/repos/code4craft/webmagic/keys{/key_id}", + "collaborators_url": "https://api.github.com/repos/code4craft/webmagic/collaborators{/collaborator}", + "teams_url": "https://api.github.com/repos/code4craft/webmagic/teams", + "hooks_url": "https://api.github.com/repos/code4craft/webmagic/hooks", + "issue_events_url": "https://api.github.com/repos/code4craft/webmagic/issues/events{/number}", + "events_url": "https://api.github.com/repos/code4craft/webmagic/events", + "assignees_url": "https://api.github.com/repos/code4craft/webmagic/assignees{/user}", + "branches_url": "https://api.github.com/repos/code4craft/webmagic/branches{/branch}", + "tags_url": "https://api.github.com/repos/code4craft/webmagic/tags", + "blobs_url": "https://api.github.com/repos/code4craft/webmagic/git/blobs{/sha}", + "git_tags_url": "https://api.github.com/repos/code4craft/webmagic/git/tags{/sha}", + "git_refs_url": "https://api.github.com/repos/code4craft/webmagic/git/refs{/sha}", + "trees_url": "https://api.github.com/repos/code4craft/webmagic/git/trees{/sha}", + "statuses_url": "https://api.github.com/repos/code4craft/webmagic/statuses/{sha}", + "languages_url": "https://api.github.com/repos/code4craft/webmagic/languages", + "stargazers_url": "https://api.github.com/repos/code4craft/webmagic/stargazers", + "contributors_url": "https://api.github.com/repos/code4craft/webmagic/contributors", + "subscribers_url": "https://api.github.com/repos/code4craft/webmagic/subscribers", + "subscription_url": "https://api.github.com/repos/code4craft/webmagic/subscription", + "commits_url": "https://api.github.com/repos/code4craft/webmagic/commits{/sha}", + "git_commits_url": "https://api.github.com/repos/code4craft/webmagic/git/commits{/sha}", + "comments_url": "https://api.github.com/repos/code4craft/webmagic/comments{/number}", + "issue_comment_url": "https://api.github.com/repos/code4craft/webmagic/issues/comments{/number}", + "contents_url": "https://api.github.com/repos/code4craft/webmagic/contents/{+path}", + "compare_url": "https://api.github.com/repos/code4craft/webmagic/compare/{base}...{head}", + "merges_url": "https://api.github.com/repos/code4craft/webmagic/merges", + "archive_url": "https://api.github.com/repos/code4craft/webmagic/{archive_format}{/ref}", + "downloads_url": "https://api.github.com/repos/code4craft/webmagic/downloads", + "issues_url": "https://api.github.com/repos/code4craft/webmagic/issues{/number}", + "pulls_url": "https://api.github.com/repos/code4craft/webmagic/pulls{/number}", + "milestones_url": "https://api.github.com/repos/code4craft/webmagic/milestones{/number}", + "notifications_url": "https://api.github.com/repos/code4craft/webmagic/notifications{?since,all,participating}", + "labels_url": "https://api.github.com/repos/code4craft/webmagic/labels{/name}", + "releases_url": "https://api.github.com/repos/code4craft/webmagic/releases{/id}", + "deployments_url": "https://api.github.com/repos/code4craft/webmagic/deployments", + "created_at": "2013-04-23T12:57:36Z", + "updated_at": "2017-06-03T03:58:13Z", + "pushed_at": "2017-06-03T07:10:15Z", + "git_url": "git://github.com/code4craft/webmagic.git", + "ssh_url": "git@github.com:code4craft/webmagic.git", + "clone_url": "https://github.com/code4craft/webmagic.git", + "svn_url": "https://github.com/code4craft/webmagic", + "homepage": "http://webmagic.io/", + "size": 16982, + "stargazers_count": 4566, + "watchers_count": 4566, + "language": "Java", + "has_issues": true, + "has_projects": true, + "has_downloads": true, + "has_wiki": true, + "has_pages": true, + "forks_count": 2432, + "mirror_url": null, + "open_issues_count": 96, + "forks": 2432, + "open_issues": 96, + "watchers": 4566, + "default_branch": "master", + "network_count": 2432, + "subscribers_count": 618 +} diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index a447e39320631db615564697c0c0f7cd189d2cf7..072bb3fd5d1002bd908683a293ef453c48410149 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.0-SNAPSHOT + 0.7.3 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index 1e335393ef9cda36f0f60220a95de3fcba6d964c..95f706ed5beb5f92f03daec44920c3f58736c13b 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.0-SNAPSHOT + 0.7.3 4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index cd1ec64c65c4c030b8c9367db7924c0557f4211c..22956cb55ed22e9238d38871eabe8b71c1ac80ec 100755 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -3,12 +3,15 @@ webmagic-parent us.codecraft - 0.7.0-SNAPSHOT + 0.7.3 4.0.0 us.codecraft webmagic-scripts + + 1.1.2-2 + @@ -16,6 +19,12 @@ jruby 1.7.6 + + org.jetbrains.kotlin + kotlin-stdlib + ${kotlin.version} + + org.codehaus.groovy groovy-all @@ -48,6 +57,7 @@ + ${project.basedir}/src/main/java maven-compiler-plugin @@ -77,6 +87,25 @@ + + org.codehaus.mojo + build-helper-maven-plugin + 3.0.0 + + + add-source + generate-sources + + add-source + + + + ${project.basedir}/src/main/kotlin + + + + + diff --git a/webmagic-scripts/src/main/kotlin/Github.kt b/webmagic-scripts/src/main/kotlin/Github.kt new file mode 100644 index 0000000000000000000000000000000000000000..3d6ca21850d8cfcaf3d13302b0bc11203486f829 --- /dev/null +++ b/webmagic-scripts/src/main/kotlin/Github.kt @@ -0,0 +1,40 @@ + +import us.codecraft.webmagic.Page +import us.codecraft.webmagic.Site +import us.codecraft.webmagic.Spider +import us.codecraft.webmagic.processor.PageProcessor +import us.codecraft.webmagic.processor.example.GithubRepoPageProcessor + +/** + * + * @author code4crafter@gmail.com + * Date: 2017/5/31 + * Time: 下午11:33 + * + */ +class GithubRepoPageProcessor : PageProcessor { + + private val site = Site.me().setRetryTimes(3).setSleepTime(1000).setTimeOut(10000) + + override fun process(page: Page) { + page.addTargetRequests(page.html.links().regex("(https://github\\.com/[\\w\\-]+/[\\w\\-]+)").all()) + page.addTargetRequests(page.html.links().regex("(https://github\\.com/[\\w\\-])").all()) + page.putField("author", page.url.regex("https://github\\.com/(\\w+)/.*").toString()) + page.putField("name", page.html.xpath("//h1[@class='public']/strong/a/text()").toString()) + if (page.resultItems.get("name") == null) { + //skip this page + page.setSkip(true) + } + page.putField("readme", page.html.xpath("//div[@id='readme']/tidyText()")) + } + + override fun getSite(): Site { + return site + } + + companion object { + @JvmStatic fun main(args: Array) { + Spider.create(GithubRepoPageProcessor()).addUrl("https://github.com/code4craft").thread(5).run() + } + } +} diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index bdc9d8a1cbc6dbdc4130d877e422cc0923a61e92..1cbf59216a812072848e31e4dee7523a408901f9 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.0-SNAPSHOT + 0.7.3 4.0.0 diff --git a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java index 6e350aada6abb3e51f26420fdb9cc3d5e13de070..f45f7e2a8a52ce287138f793ea825fe08f3792fe 100644 --- a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java +++ b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java @@ -5,7 +5,6 @@ import org.openqa.selenium.By; import org.openqa.selenium.Cookie; import org.openqa.selenium.WebDriver; import org.openqa.selenium.WebElement; - import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; @@ -13,7 +12,6 @@ import us.codecraft.webmagic.Task; import us.codecraft.webmagic.downloader.Downloader; import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.selector.PlainText; -import us.codecraft.webmagic.utils.UrlUtils; import java.io.Closeable; import java.io.IOException; @@ -108,8 +106,7 @@ public class SeleniumDownloader implements Downloader, Closeable { String content = webElement.getAttribute("outerHTML"); Page page = new Page(); page.setRawText(content); - page.setHtml(new Html(UrlUtils.fixAllRelativeHrefs(content, - request.getUrl()))); + page.setHtml(new Html(content, request.getUrl())); page.setUrl(new PlainText(request.getUrl())); page.setRequest(request); webDriverPool.returnToPool(webDriver);