Scrapy recursive crawler problem


The spider is as follows:
`#-"coding:utf-8 -"-
import sys, os
reloadsys

from scrapy.spiders import Spider
from scrapy.spiders import Request
from scrapy.selector import Selector
from network.items import WandoujiawangyeItem

base = "D:/python_workspace/datasets/"

class WandoujiawangyeSpiderSpider:

name = "wandoujiawangye"
download_delay = 1
allowed_domains = ["wandoujia.com"]  
start_urls = [  
   "http://www.wandoujia.com/category/app" ,
   "http://www.wandoujia.com/category/game"          
]  

def parseself, response:
    items =[]
    sel = Selectorresponse
    big_urls = sel.xpath//li[@class=\"parent-cate\"]/a/@href.extract  
    big_titles = sel.xpath//li[@class=\"parent-cate\"]/a/text.extract

    second_urls = sel.xpath//li[@class=\"child-cate\"]/a/@href.extract
    second_titles = sel.xpath//li[@class=\"child-cate\"]/a/text.extract
      
    for i in range0,lenbig_titles-1:
        file_name = base +big_titles[i]
  #创建目录
        ifnot os.path.existsfile_name:
            os.makedirsfile_name
        for j in range0,lensecond_titles:
            item =WandoujiawangyeItem                
            item[parent_url] = big_urls[i]
            item[parent_title] = big_titles[i]
            if_belong = second_urls[j].startswithitem[parent_url]
            ifif_belong:
                second_file_name =file_name + / +second_titles[j]
                ifnot os.path.extistssecond_file_name:
                    os.makedirssecond_file_name
                item[second_url] = second_urls[j]
                item[second_title] = second_titles[j]
                item[path] = second_file_name
                items.appenditem
    for item in items:
        yield Requesturl=item[second_url],meta={item_1:item},callbck=self.second_parse,dont_filter=True
 #对于返回的小类的url,再进行递归请求
def second_parseself,response:
    sel = Selectorresponse
    item_1 = response.meta[item_1]
    items = []
    bigUrls = sel.xpath//a/@href.extract   

    for i in range0,lenbigUrls:
        if_belong = bigUrls[i].endswith.shtml and bigUrls[i].startswithitem_1[parent_url]
        ifif_belong:
            item = WandoujiawangyeItem
            item[parent_title]=item_1[parent_title]
            item[parent_url]=item_1[parent_url]
            item[second_url]=item_1[second_url]
            item[second_title]=item_1[second_title]
            item[path]=item_1[path]
            item[link_url]=bigUrls[i]
            items.appenditem
    for item in items:
        yield Requesturl=item[link_url],meta={item_2:item},callback=self.detail_parse,dont_filter=True
def detail_parseself,response:
    sel=Selectorresponse
    item=response.mata[item_2]
    content=""                  
    head=sel.xpath//span[@class=\"title\"]/text         
    content_list=sel.xpath//div[@class=\"desc-info\"]/div/text 
    for content_one in content_list:
        content+=content_one   
    item[head]=head              
    item[content]=content     
    yield item
        

Why am I only able to crawl big_titles? The spider doesn't seem to connect to the second level: second_titles and everything after it never come out. Where is the problem? I'm waiting online, please help!

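For anyone debugging the same symptom, two checks are worth making before suspecting the recursion itself (this is a suggestion, not a confirmed diagnosis). First, an exact `@class="child-cate"` comparison matches nothing if the `<li>` carries any additional classes, so `second_urls` and `second_titles` may simply be empty lists. Second, `startswith()` returns False whenever the child hrefs are site-relative (e.g. `/category/...`) while `parent_url` is absolute, in which case the `if_belong` filter drops every item and no second-level `Request` is ever yielded. Below is a minimal, hypothetical debugging spider illustrating both checks; the XPaths and start URL are copied from the question, everything else is an assumption:

```python
# Minimal debugging sketch (assumptions: same page structure as the question;
# Scrapy >= 1.0 for response.urljoin and the built-in spider logger).
from scrapy import Spider, Request

class DebugSpider(Spider):
    name = "debug_wandoujia"
    start_urls = ["http://www.wandoujia.com/category/app"]

    def parse(self, response):
        # Check 1: does the child-cate XPath match anything at all?
        # contains() also matches elements carrying extra classes.
        hrefs = response.xpath('//li[contains(@class, "child-cate")]/a/@href').extract()
        self.logger.info("child-cate links matched: %d", len(hrefs))

        for href in hrefs:
            # Check 2: normalize relative hrefs before comparing or requesting.
            url = response.urljoin(href)
            self.logger.info("second-level url: %s", url)
            yield Request(url, callback=self.second_parse, dont_filter=True)

    def second_parse(self, response):
        # If this never logs, the requests were filtered or dropped upstream.
        self.logger.info("reached second level: %s", response.url)
```

Running the same XPaths interactively in `scrapy shell "http://www.wandoujia.com/category/app"` is the fastest way to see which of the two checks fails.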
