There are several ways to remove duplicate items in Scrapy. The simplest is an item pipeline that keeps an in-memory set of keys it has already seen and drops any item whose key is already in the set. Example pipeline code:
from scrapy.exceptions import DropItem

class RemoveDuplicatesPipeline:
    def __init__(self):
        self.seen = set()

    def process_item(self, item, spider):
        # Pass the item through only the first time its key appears
        if item['key'] not in self.seen:
            self.seen.add(item['key'])
            return item
        else:
            raise DropItem('Duplicate item found: %s' % item)
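To activate the pipeline, register it in your project settings. The module path below assumes the class lives in myproject/pipelines.py, and 300 is just an arbitrary position in the pipeline order; adjust both to fit your project:

# settings.py
ITEM_PIPELINES = {
    'myproject.pipelines.RemoveDuplicatesPipeline': 300,
}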
The in-memory set is lost when the spider finishes. To deduplicate across runs, persist the seen keys in a SQLite database instead. Example pipeline code:
import sqlite3

from scrapy.exceptions import DropItem

class RemoveDuplicatesPipeline:
    def __init__(self):
        self.conn = sqlite3.connect('database.db')
        self.cursor = self.conn.cursor()
        self.cursor.execute('CREATE TABLE IF NOT EXISTS seen_items (id INTEGER PRIMARY KEY AUTOINCREMENT, key TEXT)')
        self.conn.commit()

    def process_item(self, item, spider):
        # An empty result means the key has not been stored yet
        self.cursor.execute('SELECT 1 FROM seen_items WHERE key = ?', (item['key'],))
        if not self.cursor.fetchone():
            self.cursor.execute('INSERT INTO seen_items (key) VALUES (?)', (item['key'],))
            self.conn.commit()
            return item
        else:
            raise DropItem('Duplicate item found: %s' % item)
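Opening the connection in __init__ works, but Scrapy pipelines also provide open_spider and close_spider hooks, which let you close the connection cleanly when the crawl ends. A minimal sketch of the same idea using those hooks (the database filename is an assumption, and a PRIMARY KEY constraint does the duplicate check):

import sqlite3

from scrapy.exceptions import DropItem

class RemoveDuplicatesPipeline:
    def open_spider(self, spider):
        # Called once when the spider starts: open the database
        self.conn = sqlite3.connect('seen_items.db')
        self.conn.execute('CREATE TABLE IF NOT EXISTS seen_items (key TEXT PRIMARY KEY)')
        self.conn.commit()

    def close_spider(self, spider):
        # Called once when the spider closes: release the connection
        self.conn.close()

    def process_item(self, item, spider):
        try:
            # The PRIMARY KEY constraint rejects duplicate keys atomically
            self.conn.execute('INSERT INTO seen_items (key) VALUES (?)', (item['key'],))
            self.conn.commit()
        except sqlite3.IntegrityError:
            raise DropItem('Duplicate item found: %s' % item)
        return item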
Alternatively, if you set the DUPEFILTER_CLASS option in your Scrapy settings file, the framework will filter duplicate requests automatically (note that this deduplicates requests, not scraped items). You can choose from the built-in classes or create your own. Example settings code:
# RFPDupeFilter is the default: it filters requests by fingerprint
DUPEFILTER_CLASS = 'scrapy.dupefilters.RFPDupeFilter'
# or BaseDupeFilter, which performs no filtering and is meant as a base for custom filters
DUPEFILTER_CLASS = 'scrapy.dupefilters.BaseDupeFilter'
Or write a custom filter. A minimal one that deduplicates by exact URL (inheriting from BaseDupeFilter supplies the rest of the interface, including the open and close hooks):

from scrapy.dupefilters import BaseDupeFilter

class CustomDupeFilter(BaseDupeFilter):
    def __init__(self):
        self.seen = set()

    def request_seen(self, request):
        # Returning True tells Scrapy to drop the request
        if request.url in self.seen:
            return True
        self.seen.add(request.url)
        return False

Then point the setting at it:

DUPEFILTER_CLASS = 'myproject.middlewares.CustomDupeFilter'
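Whichever filter is active, individual requests can opt out of deduplication with the dont_filter flag. A short sketch (the spider name, URL, and callback are placeholders):

import scrapy

class MySpider(scrapy.Spider):
    name = 'myspider'
    start_urls = ['https://example.com']

    def parse(self, response):
        # dont_filter=True bypasses the dupefilter for this request only
        yield scrapy.Request(response.url, callback=self.parse_again, dont_filter=True)

    def parse_again(self, response):
        pass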