XSpider爬虫框架

XSpider爬虫框架

项目背景

	
main.py:

from xspider.spider.spider import BaseSpider
from xspider.filters import urlfilter
from kuailiyu import KuaiLiYu

if __name__ == "__main__":
	# Build a spider limited to the kuailiyu site, starting from its home
	# page, with KuaiLiYu as the page processor.
	spider = BaseSpider(
		name="kuailiyu",
		page_processor=KuaiLiYu(),
		allow_site=["kuailiyu.cyzone.cn"],
		start_urls=["http://kuailiyu.cyzone.cn/"],
	)
	# URL patterns for ".../article/<digits>.html" and ".../index_<digits>.html".
	# Raw strings keep the regex backslashes intact (a plain "\." is an
	# invalid string escape), and every literal dot is escaped so that "."
	# cannot match an arbitrary character.
	url_patterns = [
		r"kuailiyu\.cyzone\.cn/article/[0-9]*\.html$",
		r"kuailiyu\.cyzone\.cn/index_[0-9]+\.html$",
	]
	spider.url_filters.append(urlfilter.UrlRegxFilter(url_patterns))
	spider.start()

kuailiyu.py:

from xspider import processor 
from xspider.selector import xpath_selector
from xspider import model


class KuaiLiYu(processor.PageProcessor.PageProcessor):
	"""Page processor that records the title and URL of each page."""

	def __init__(self):
		"""Initialise the base processor and create the title selector."""
		super(KuaiLiYu, self).__init__()
		# XPath addressing the text content of the <title> element.
		title_xpath = "//title/text()"
		self.title_extractor = xpath_selector.XpathSelector(path=title_xpath)

	def process(self, page, spider):
		"""Return a Fileds container holding the page's title and URL.

		``page`` is passed to the title selector and supplies ``url``;
		``spider`` is accepted but not used by this processor.
		"""
		record = model.fileds.Fileds()
		page_title = self.title_extractor.find(page)
		record["title"] = page_title
		record["url"] = page.url
		return record
	

参考资料

Fork me on GitHub