import urllib2
import Queue
import sys
import traceback
import threading
import re
import datetime
import lxml
import chardet
import logging
import logging.handlers

from time import sleep
from urlparse import urlparse
from lxml import etree
from optparse import OptionParser

try:
    from sqlite3 import dbapi2 as sqlite
except ImportError:
    from pysqlite2 import dbapi2 as sqlite
'''
This script crawls web pages and analyzes them for a keyword.

Features:
    1. The crawl depth can be specified
    2. Text matching the keyword is stored in sqlite
    3. Activity is recorded with the logging module
    4. Work is distributed over a concurrent thread pool

Required dependencies:
    1. chardet  # detects the character set of fetched pages
       sudo easy_install chardet

Usage:
    spider.py -u url -d deep -f logfile -l loglevel(1-5) --testself
              --thread number --dbfile filepath --key="HTML5"

Writer: Dongweiming
Date: 2012.10.22
'''

lock = threading.Lock()
LOGGER = logging.getLogger('Crawler')
LEVELS = {
    1: 'CRITICAL',
    2: 'ERROR',
    3: 'WARNING',
    4: 'INFO',
    5: 'DEBUG',
}
formatter = logging.Formatter('%(name)s %(asctime)s %(levelname)s %(message)s')


class mySqlite(object):

    def __init__(self, path, logger, level):
        '''Initialize the database connection.

        >>> from sqlite3 import dbapi2 as sqlite
        >>> conn = sqlite.connect('testdb')
        '''
        self.logger = logger
        self.loglevel = level
        try:
            self.conn = sqlite.connect(path)
            self.cur = self.conn.cursor()
        except Exception, e:
            # Log and re-raise: __init__ must not return a value
            myLogger(logger, level, e, True)
            raise

    def create(self, table):
        '''Create the table with two columns:
        Id (integer, autoincrement) and Data (varchar).'''
        try:
            self.cur.execute("CREATE TABLE IF NOT EXISTS %s"
                             "(Id INTEGER PRIMARY KEY AUTOINCREMENT,"
                             " Data VARCHAR(40))" % table)
            self.done()
        except sqlite.Error, e:
            myLogger(self.logger, self.loglevel, e, True)
            self.conn.rollback()
        else:
            if self.loglevel > 3:
                myLogger(self.logger, self.loglevel, 'Created table %s' % table)

    def insert(self, table, data):
        '''Insert a row of data into the given table.'''
        try:
            # Bind the value instead of interpolating it into the SQL string,
            # so quotes in the scraped text cannot break the statement
            self.cur.execute("INSERT INTO %s(Data) VALUES(?)" % table, (data,))
            self.done()
        except sqlite.Error, e:
            myLogger(self.logger, self.loglevel, e, True)
            self.conn.rollback()
        else:
            if self.loglevel > 4:
                myLogger(self.logger, self.loglevel, 'Insert succeeded')

    def done(self):
        '''Commit the transaction.'''
        self.conn.commit()

    def close(self):
        '''Close the connection.'''
        self.cur.close()
        self.conn.close()
        if self.loglevel > 3:
            myLogger(self.logger, self.loglevel, 'Closed the sqlite connection')


class Crawler(object):

    def __init__(self, args, app, table, logger):
        self.deep = args.depth
        self.url = args.urlpth
        self.key = args.key
        self.logfile = args.logfile
        self.loglevel = args.loglevel
        self.dbpth = args.dbpth
        self.tp = app
        self.table = table
        self.logger = logger
        self.visitedUrl = []

    def _hasCrawler(self, url):
        '''Return True if this page has already been crawled.'''
        return url in self.visitedUrl

    def getPageSource(self, url, key, deep):
        '''Fetch the page, analyze it, and store matches in the database.'''
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; '
                          'rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
        }
        if self._hasCrawler(url):
            return
        else:
            self.visitedUrl.append(url)
        try:
            request = urllib2.Request(url=url, headers=headers)
            result = urllib2.urlopen(request).read()
        except urllib2.HTTPError, e:
            myLogger(self.logger, self.loglevel, e, True)
            return -1
        try:
            encoding = chardet.detect(result)['encoding']
            if encoding.lower() == 'gb2312':
                encoding = 'gbk'
            if encoding.lower() != 'utf-8':
                result = result.decode(encoding)
        except Exception, e:
            myLogger(self.logger, self.loglevel, e, True)
            return -1
        else:
            if self.loglevel > 3:
                myLogger(self.logger, self.loglevel, 'Fetched page %s successfully' % url)
        try:
            self._xpath(url, result, ['a'], unicode(key, 'utf8'), deep)
            self._xpath(url, result, ['title', 'p', 'li', 'div'],
                        unicode(key, 'utf8'), deep)
        except TypeError:
            # key is already unicode, so unicode(key, 'utf8') raised TypeError
            self._xpath(url, result, ['a'], key, deep)
            self._xpath(url, result, ['title', 'p', 'li', 'div'], key, deep)
        except Exception, e:
            myLogger(self.logger, self.loglevel, e, True)
            return -1
        else:
            if self.loglevel > 3:
                myLogger(self.logger, self.loglevel, 'Analyzed page %s successfully' % url)
        return True

    def _xpath(self, weburl, data, xpath, key, deep):
        '''Walk the given tags: queue links for further crawling and store
        element text that matches the keyword.'''
        sq = mySqlite(self.dbpth, self.logger, self.loglevel)
        page = etree.HTML(data)
        for i in xpath:
            hrefs = page.xpath(u"//%s" % i)
            if deep > 1:
                for href in hrefs:
                    url = href.attrib.get('href', '')
                    if not url.startswith('java') and not url.startswith('mailto'):
                        self.tp.add_job(self.getPageSource, url, key, deep - 1)
            for href in hrefs:
                value = href.text
                if value:
                    m = re.compile(r'.*%s.*' % key).match(value)
                    if m:
                        sq.insert(self.table, m.group().strip())
        sq.close()

    def work(self):
        '''Main entry point.

        >>> import datetime
        >>> logger = configLogger('test.log')
        >>> time = datetime.datetime.now().strftime("%m%d%H%M%S")
        >>> sq = mySqlite('test.db', logger, 1)
        >>> table = 'd' + str(time)
        >>> sq.create(table)
        >>> tp = ThreadPool(5)
        >>> def t(): pass
        >>> t.depth = 1
        >>> t.urlpth = 'http://www.baidu.com'
        >>> t.logfile = 'test.log'
        >>> t.loglevel = 1
        >>> t.dbpth = 'test.db'
        >>> t.key = 'test'
        >>> d = Crawler(t, tp, table, logger)
        >>> d.getPageSource(t.urlpth, t.key, t.depth)
        True
        '''
        if not self.url.startswith('http://'):
            self.url = 'http://' + self.url
        self.tp.add_job(self.getPageSource, self.url, self.key, self.deep)
        self.tp.wait_for_complete()


class MyThread(threading.Thread):

    def __init__(self, workQueue, timeout=30, **kwargs):
        threading.Thread.__init__(self, kwargs=kwargs)
        self.timeout = timeout
        self.setDaemon(True)
        self.workQueue = workQueue
        self.start()

    def run(self):
        '''Worker loop: pull jobs off the queue until it stays empty for
        `timeout` seconds.'''
        while True:
            try:
                lock.acquire()
                callable, args = self.workQueue.get(timeout=self.timeout)
                callable(*args)
            except Queue.Empty:
                break
            except Exception, e:
                # The thread has no logger of its own; fall back to the module logger
                LOGGER.error(e)
                LOGGER.error(traceback.format_exc())
                return -1
            finally:
                # Always release, otherwise a timeout would leave the lock held
                # and deadlock the remaining worker threads
                lock.release()


class ThreadPool(object):

    def __init__(self, num_of_threads):
        self.workQueue = Queue.Queue()
        self.threads = []
        self.__createThreadPool(num_of_threads)

    def __createThreadPool(self, num_of_threads):
        for i in range(num_of_threads):
            thread = MyThread(self.workQueue)
            self.threads.append(thread)

    def wait_for_complete(self):
        '''Wait for all threads to finish.'''
        while len(self.threads):
            thread = self.threads.pop()
            if thread.isAlive():
                thread.join()

    def add_job(self, callable, *args):
        '''Add a job to the work queue.'''
        self.workQueue.put((callable, args))


def configLogger(logfile):
    '''Configure the log file handler and level.'''
    try:
        handler = logging.handlers.RotatingFileHandler(logfile,
                                                       maxBytes=10240000,
                                                       backupCount=5)
    except IOError, e:
        print e
        return -1
    else:
        handler.setFormatter(formatter)
        LOGGER.addHandler(handler)
        logging.basicConfig(level=logging.NOTSET)
        return LOGGER
def myLogger(logger, lv, mes, err=False):
    '''Write a log record at the level given by lv; append a traceback if err is True.'''
    getattr(logger, LEVELS.get(lv, 'WARNING').lower())(mes)
    if err:
        getattr(logger, LEVELS.get(lv, 'WARNING').lower())(traceback.format_exc())


def parse():
    parser = OptionParser(
        description="This script is used to crawl and analyze web pages.")
    parser.add_option("-u", "--url", dest="urlpth", action="store",
                      help="URL you want to fetch",
                      metavar="www.sina.com.cn")
    parser.add_option("-d", "--deep", dest="depth", action="store", type="int",
                      help="Crawl depth, default 1", default=1)
    parser.add_option("-k", "--key", dest="key", action="store",
                      help="Keyword you want to query, for example 'test'")
    parser.add_option("-f", "--file", dest="logfile", action="store",
                      help="Log file path and name, default spider.log",
                      default='spider.log')
    parser.add_option("-l", "--level", dest="loglevel", action="store", type="int",
                      help="Log level, default 1 (CRITICAL)", default=1)
    parser.add_option("-t", "--thread", dest="thread", action="store", type="int",
                      help="Size of the thread pool, default 10", default=10)
    parser.add_option("-q", "--dbfile", dest="dbpth", action="store",
                      help="Sqlite file directory and name, e.g. test.db",
                      metavar='test.db')
    parser.add_option("-s", "--testself", dest="testself", action="store_true",
                      help="Run the self-test (doctest)", default=False)
    (options, args) = parser.parse_args()
    return options


def main():
    '''Main function.'''
    options = parse()
    if options.testself:
        import doctest
        print doctest.testmod()
        return
    if not options.urlpth or not options.key or not options.dbpth:
        print 'Need to specify the options "-u", "-k" and "-q"!'
        return
    if '-h' in sys.argv or '--help' in sys.argv:
        print __doc__
    logger = configLogger(options.logfile)
    time = datetime.datetime.now().strftime("%m%d%H%M%S")
    tp = ThreadPool(options.thread)
    sq = mySqlite(options.dbpth, logger, options.loglevel)
    table = 'd' + str(time)
    sq.create(table)
    sq.close()
    crawler = Crawler(options, tp, table, logger)
    crawler.work()


if __name__ == '__main__':
    main()
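
# A quick usage sketch (illustrative values only; assumes this file is saved
# as spider.py). The flags map to the options defined in parse() above.
#
#   Crawl www.sina.com.cn two levels deep with 10 worker threads, storing text
#   that contains "HTML5" in test.db and logging at DEBUG level to spider.log:
#
#       python spider.py -u www.sina.com.cn -d 2 -k HTML5 -q test.db -l 5 -t 10 -f spider.log
#
#   Run the embedded doctests instead of crawling:
#
#       python spider.py --testself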