爬取
抓取新闻列表中所有分页的新闻详情,包括标题、正文、时间、来源等信息。
创建项目
scrapy startproject China
scrapy genspider -t crawl chinatech tech.china.com
items.py
1 from scrapy import Field, Item 2 3 4 class ChinaItem(Item): 5 # define the fields for your item here like: 6 # name = scrapy.Field() 7 8 title = Field() 9 text = Field()10 datetime = Field()11 source = Field()12 url = Field()13 website = Field()
chinatech.py
1 import scrapy 2 from scrapy.linkextractors import LinkExtractor 3 from scrapy.spiders import CrawlSpider, Rule 4 from China.items import * 5 from China.loaders import * 6 7 class ChinatechSpider(CrawlSpider): 8 name = 'chinatech' 9 allowed_domains = ['tech.china.com']10 start_urls = ['http://tech.china.com/articles/']11 12 rules = (13 Rule(LinkExtractor(allow='article\/.*\.html', restrict_xpaths='//div[@id="left_side"]//div[@class="con_item"]'),14 callback='parse_item'),15 Rule(LinkExtractor(restrict_xpaths='//div[@id="pageStyle"]//a[contains(., "下一页")]'))16 )17 18 def parse_item(self, response):19 loader = ChinaLoader(item=ChinaItem(), response=response)20 loader.add_xpath('title', '//h1[@id="chan_newsTitle"]/text()')21 loader.add_value('url', response.url)22 loader.add_xpath('text', '//div[@id="chan_newsDetail"]//text()')23 loader.add_xpath('datetime', '//div[@id="chan_newsInfo"]/text()', re='(\d+-\d+-\d+\s\d+:\d+:\d+)')24 loader.add_xpath('source', '//div[@id="chan_newsInfo"]/text()', re='来源:(.*)')25 loader.add_value('website', '中华网')26 yield loader.load_item()
loaders.py
1 from scrapy.loader import ItemLoader 2 from scrapy.loader.processors import TakeFirst, Join, Compose 3 4 5 class NewsLoader(ItemLoader): 6 default_output_processor = TakeFirst() 7 8 9 class ChinaLoader(NewsLoader):10 text_out = Compose(Join(), lambda s: s.strip())11 source_out = Compose(Join(), lambda s: s.strip())
pipelines.py
1 import json 2 3 class ChinaPipeline(object): 4 5 def __init__(self): 6 self.filename = open("china.json", "w") 7 8 def process_item(self, item, spider): 9 text = json.dumps(dict(item), ensure_ascii = False) + ",\n"10 self.filename.write(text)11 return item12 13 def close_spider(self, spider):14 self.filename.close()
settings.py
1 BOT_NAME = 'China' 2 3 SPIDER_MODULES = ['China.spiders'] 4 NEWSPIDER_MODULE = 'China.spiders' 5 6 ROBOTSTXT_OBEY = False 7 8 ITEM_PIPELINES = { 9 'China.pipelines.ChinaPipeline': 300,10 }