In Scrapy, the component that processes scraped data (Items) is called the Item Pipeline.
When xx_spider.py yields an item, it is passed through the pipelines listed in ITEM_PIPELINES in settings.py, in order:
ITEM_PIPELINES = {
    'xx.pipelines.DuplicatesPipeline': 2,
    'xx.pipelines.MongoPipeline': 3,
}
The number after each pipeline is its priority, an integer from 0 to 1000; pipelines run in ascending order, so lower numbers execute first.
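For context, here is a minimal sketch of a spider yielding items that carry a url field, which the DuplicatesPipeline below keys on. The names DemoItem, DemoSpider, and example.com are assumptions for illustration only, not part of the original project.

import scrapy


class DemoItem(scrapy.Item):
    url = scrapy.Field()      # DuplicatesPipeline deduplicates on this field
    title = scrapy.Field()


class DemoSpider(scrapy.Spider):
    name = 'demo'
    start_urls = ['https://example.com']

    def parse(self, response):
        for href in response.css('a::attr(href)').getall():
            # every yielded item is routed through the enabled pipelines
            yield DemoItem(url=response.urljoin(href),
                           title=response.css('title::text').get())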
The key file is pipelines.py; here is an example:
import pymongo
from scrapy.exceptions import DropItem


class FirstPipeline(object):
    # placeholder pipeline; not enabled in ITEM_PIPELINES above

    def process_item(self, item, spider):
        # a real pipeline should return the item (or raise DropItem)
        pass


class DuplicatesPipeline(object):
    # drops items whose url has already been seen during this crawl

    def __init__(self):
        self.urls_seen = set()

    def process_item(self, item, spider):
        if item['url'] in self.urls_seen:
            raise DropItem("Duplicate item found: %s" % item)
        else:
            self.urls_seen.add(item['url'])
            return item


class MongoPipeline(object):
    # writes each item to MongoDB, one collection per item class

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # read the connection settings from settings.py
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE', 'items')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        collection_name = item.__class__.__name__
        self.db[collection_name].insert_one(dict(item))
        return item
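MongoPipeline pulls its connection settings from the crawler via from_crawler, so MONGO_URI and MONGO_DATABASE should be defined in settings.py. A minimal sketch, where the URI value is an assumption and MONGO_DATABASE is optional because from_crawler falls back to 'items':

MONGO_URI = 'mongodb://localhost:27017'   # assumed local MongoDB instance
MONGO_DATABASE = 'scrapy_items'           # optional; defaults to 'items' if omitted

Note that when DuplicatesPipeline raises DropItem, Scrapy stops processing that item, so it never reaches MongoPipeline.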