“Пример загрузчика скраски” Ответ

Пример загрузчика скраски

def parse_news_metro(self, response):
        loader = ItemLoader(item=News(), response=response)
        loader.add_value('url', response.url)

        date_selector = response.css('.artikel > div.block-tanggal::text')
        if not date_selector:
            return self.parse_news_pilkada(loader, response)
        try:
            date_time_str = date_selector.extract()[0].split(',')[1].strip()[:-4]
            date_time_str = ' '.join([_(x) for x in date_time_str.split(' ')])
            published_at_wib = datetime.strptime(date_time_str, '%d %B %Y | %H:%M')
        except Exception:
            return loader.load_item()
        published_at = wib_to_utc(published_at_wib)
        if (self.media['last_scraped_at'] >= published_at):
            is_no_update = True
            self.logger.info('Media have no update')
            raise CloseSpider('finished')
        loader.add_value('published_at', published_at)

        title_selector = response.css('.artikel > h1::text')
        if not title_selector:
            return loader.load_item()
        loader.add_value('title', title_selector.extract()[0])

        # Select all p which don't have iframe inside it
        raw_content_selector = response.xpath('//div[@class="artikel"]//p[not(iframe)]')
        if not raw_content_selector:
            return loader.load_item()
        raw_content = ''
        for rsl in raw_content_selector:
            raw_content = raw_content + rsl.extract().strip()

        # Go to next page while there is next page button
        next_page_selector = response.css('.pagination-nb').xpath('//a[text()="next"]/@href')
        if next_page_selector:
            return Request(next_page_selector.extract()[0], callback=lambda x, loader=loader, raw_content=raw_content: self.parse_next_page_metro(x, loader, raw_content))

        loader.add_value('raw_content', raw_content)

        # The author usually put inside <strong> tag, however, some news is not using <strong> tag.
        # NOTE: this block of code may need revision in the future
        author_name = ''
        for author_name_selector in reversed(raw_content_selector):
            author_name_selector = author_name_selector.css('strong::text')
            for tmp in reversed(author_name_selector.extract()):
                tmp = tmp.strip()
                if tmp and all((x.isalpha() and x.isupper()) or x.isspace() or x == '.' or x == '|' for x in tmp):
                    author_name = tmp
                    break
            if author_name:
                break
        author_name = ','.join(author_name.split(' | '))
        loader.add_value('author_name', author_name)
        return loader.load_item()

Nyn

Пример загрузчика скраски

def parse_item(self, response):

        loader = ItemLoader(EolZhiyeItem(), response)
        loader.add_value('url', response.url)
        loader.add_value('code', response.url, re=r'/(\w+)\.shtml')
        loader.add_css('name', 'h1#pagetitle::text')
        loader.add_xpath('category', u'//div[@id="precontent"]/p[contains(., "??")]/a/text()')
        loader.add_xpath('category2', u'//div[@id="precontent"]/p[contains(., "??")]/a/text()')
        loader.add_xpath('detail', u'//div[@id="precontent"]/following-sibling::node()[not(self::table)]', Join('\n'))
        yield loader.load_item()

Nyn

Пример загрузчика скраски

def parse_book(self, response):
        book_loader = ItemLoader(item=BookItem(), response=response)
        book_loader.default_input_processor = MapCompose(remove_tags)
        book_loader.default_output_processor = TakeFirst()

        book_loader.add_xpath("title", "//div[@class='col-sm-6 product_main']/h1")
        book_loader.add_xpath("price", "//p[@class='price_color']")
        book_loader.add_xpath("upc", "//table[@class='table table-striped']/tr[1]/td")
        book_loader.add_xpath("product_type", "//table[@class='table table-striped']/tr[2]/td")
        book_loader.add_xpath("tax", "//table[@class='table table-striped']/tr[5]/td")
        book_loader.add_xpath("stock", "//table[@class='table table-striped']/tr[6]/td")
        book_loader.add_xpath("reviews", "//table[@class='table table-striped']/tr[7]/td")
        book_loader.add_xpath("rating", "//p[@class='instock availability']/following-sibling::p/@class")
        yield book_loader.load_item()

Nyn

Пример загрузчика скраски

def parse_item(self, response):

        loader = ItemLoader(GaokaopaiZhiyeItem(), response)
        loader.add_value('url', response.url)
        loader.add_value('code', response.url, re=ur'-([^-]+)\.html')
        loader.add_css('name', u'.modTitle>h1::text')

        def parse_category():
            for e in response.css(u'.catType>a'):
                yield {
                    'url': e.css('::attr(href)').extract_first(),
                    'code': e.css('::attr(href)').re_first(ur'-([^-]+)\.html'),
                    'name': e.css('::text').extract_first(),
                }

        loader.add_value('category', list(parse_category()))
        loader.add_css('detail', u'.zhiyeShow')

        item = loader.load_item()

        return FormRequest(
            url='http://www.gaokaopai.com/ajax-career-getRelateMajor.html',
            formdata={'code': item['code'][0]},
            meta={'item': item},
            dont_filter=True,
            callback=self.parse_majors
        )

Nyn

Пример загрузчика скраски

def parse_news(self, response):
        self.logger.info('parse_news: %s' % response)
        loader = ItemLoader(item=News(), response=response)        
        json_response = json.loads(response.body)

        try:
            url = json_response['NewsML']['NewsItem']['NewsComponent']['NewsComponent']['NewsComponent']['NewsLines']['MoreLink']
        except KeyError:
            return loader.load_item()
        loader.add_value('url', url)

        try:
            title = json_response['NewsML']['NewsItem']['NewsComponent']['NewsComponent']['NewsComponent']['NewsLines']['HeadLine']
        except KeyError:
            return loader.load_item()
        if not title:
            return loader.load_item()
        loader.add_value('title', title)

        try: 
            raw_content = json_response['NewsML']['NewsItem']['NewsComponent']['NewsComponent']['NewsComponent']['ContentItem']['DataContent']['nitf']['body']['body.content']['p']
        except KeyError:
            return loader.load_item()
        if not raw_content:
            return loader.load_item()
        loader.add_value('raw_content', raw_content)

        try:
            author_name = json_response['NewsML']['NewsItem']['NewsComponent']['NewsComponent']['Author']
        except KeyError:
            return loader.load_item()
        if not author_name:
            loader.add_value('author_name', '')
        else:
            loader.add_value('author_name', author_name)

        try:
            date_time_str = json_response['NewsML']['NewsItem']['NewsManagement']['FirstCreated']
        except KeyError:
            return loader.load_item()
        if not date_time_str:
            return loader.load_item()

        date_time_str = date_time_str.split('T')
        date_time_str[1] = '0' * (6 - len(date_time_str[1])) + date_time_str[1]
        try:
            published_at_wib = datetime.strptime(' '.join(date_time_str), '%Y%m%d %H%M%S');
        except Exception:
            return loader.load_item()
        published_at = wib_to_utc(published_at_wib)
        loader.add_value('published_at', published_at)

        return loader.load_item()

Nyn

Пример загрузчика скраски

def parse_item(self, response):

        loader = ItemLoader(ChsiDaxueItem(), response)
        loader.add_value('id', response.url, re=ur'schId-(\w+)\.dhtml')
        loader.add_value('url', response.url)
        loader.add_css('logo', u'.r_c_sch_logo>img::attr(src)', MapCompose(lambda url: urljoin('http://gaokao.chsi.com.cn/', url)))
        loader.add_css('name', u'.topImg::text')
        loader.add_css('badges', u'.r_c_sch_attr .r_c_sch_icon::attr(title)')

        data_clean = MapCompose(lambda x: re.sub(r'\s+', ' ', x), unicode.strip)
        loader.add_xpath('type', u'//span[@class="f_bold" and .="?????"]/following-sibling::text()', data_clean)
        loader.add_xpath('membership', u'//span[@class="f_bold" and .="?????"]/following-sibling::text()', data_clean)
        loader.add_xpath('province', u'//span[@class="f_bold" and span]/following-sibling::text()', data_clean)
        loader.add_xpath('address', u'//span[@class="f_bold" and .="?????"]/following-sibling::text()', data_clean)
        loader.add_xpath('phone', u'//span[@class="f_bold" and .="?????"]/following-sibling::text()', data_clean)
        loader.add_xpath('website', u'//span[@class="f_bold" and .="?????"]/following-sibling::a/@href', data_clean)
        loader.add_xpath('backdoor', u'//span[@class="f_bold" and .="?????"]/following-sibling::text()', data_clean)

        def parse_votes():
            xpath = u'//td[@class="tdMydT" and .="{}"]/following-sibling::td/div[@class="rank"]/@rank'
            get_vote = lambda what: float(response.xpath(xpath.format(what)).extract_first() or 0)
            return {
                'overall': get_vote(u'?????'),
                'environment': get_vote(u'???????'),
                'life': get_vote(u'?????'),
            }

        loader.add_value('votes', parse_votes())

        def parse_trending():
            css = u'{}>table tr:not(:first-child)'
            def get_trending(what):
                majors = []
                for e in response.css(css.format(what)):
                    majors.append({
                        'id': e.css(u'.tdZytjTDiv>a::attr(href)').re_first(r'specId=(\w+)'),
                        'name': e.css(u'.tdZytjTDiv::attr(title)').extract_first(),
                        'vote': float(e.css(u'.avg_rank::text').extract_first()),
                        'count': int(e.css(u'.c_f00::text, .red::text').extract_first()),
                    })
                return majors
            return {
                'count': get_trending(u'#topNoofPTable'),
                'index': get_trending(u'#topIndexTable'),
                'like': get_trending(u'.r_r_box_zymyd'),
            }

        loader.add_value('trending', parse_trending())

        item = loader.load_item()

        for link in LinkExtractor(restrict_xpaths=u'//ul[@id="topNav"]//a[.="????"]').extract_links(response):
            yield Request(link.url, meta={'item': item}, callback=self.parse_jianjie)

Nyn

Пример загрузчика скраски

def parse_item(self, response):
        """
        Extract fields from the individual email page and load them into the
        item.

        @url http://lkml.iu.edu/hypermail/linux/kernel/0111.3/0036.html
        @returns items 1 1
        @scrapes senderName senderEmail timeSent timeReceived subject body
        @scrapes replyto url
        """

        load = ItemLoader(item=Email(), selector=response)

        # Take care of easy fields first
        load.add_value('url', response.url)

        pattern_replyto = '//ul[1]/li[contains((b|strong), "In reply to:")]'
        pattern_replyto += '/a/@href'
        link = response.xpath(pattern_replyto).extract()
        link = [''] if not link else link

        load.add_value('replyto', link[0])

        # Sometime in 2003, the archive changes and the email pages
        # require specific procedure to extract the following fields:
        specific_fields = {
            'senderName': None,
            'senderEmail': None,
            'timeSent': None,
            'timeReceived': None,
            'subject': None
        }

        # Detect new archive system with HTML comment
        new_system = response.xpath('/comment()[1][contains(., "MHonArc")]')

        if len(new_system) >= 1:
            # If new archive system is detected...
            specific_fields = self.parse_new_system(response, specific_fields)
            body_before_comment = '<!--X-Body-of-Message-->'
            body_after_comment = '<!--X-Body-of-Message-End-->'
        else:
            # Otherwise...
            specific_fields = self.parse_old_system(response, specific_fields)
            body_before_comment = '<!-- body="start" -->'
            body_after_comment = '<!-- body="end" -->'

        # Load all the values from these specific fields
        for key, val in specific_fields.items():
            load.add_value(key, val)

        if self.get_body:
            # Final field, the body of the email
            pattern_body = body_before_comment + '\n?(.*)' + body_after_comment

            # Ignore invalid bytes when necessary
            page_body = response.body.decode('utf-8', 'ignore')
            body = re.search(pattern_body, page_body, flags=re.S)
            load.add_value('body', body.group(1))

        return load.load_item()

Nyn

Пример загрузчика скраски

def parse_item(self, response):
        il = ItemLoader(item=ImageItem(), response=response)
        il.add_css('image_urls', 'img::attr(src)')
        return il.load_item()

Nyn

Пример загрузчика скраски

def parse(self, response):

        for country in response.css(".col-md-4, .country"):
            item = ItemLoader(item=CountryItem(), selector=country)

            item.add_css("country", ".country-name")
            item.add_css("capital", ".country-capital::text")
            item.add_css("population", ".country-population::text")
            item.add_css("area", ".country-area::text")

            yield item.load_item()

Nyn

Пример загрузчика скраски

def parse_song_list(self, response):
        selector = Selector(response)

        song_name_list = selector.xpath('//body//ul[@class="f-hide"]/li/a/text()').extract()
        song_id_list = selector.xpath('//body//ul[@class="f-hide"]/li/a/@href').extract()
        title = selector.xpath('//title/text()').extract()
        for index, id_ in enumerate(song_id_list):
            l = ItemLoader(item=PlayListItem())
            l.add_value('song_name', song_name_list[index])
            l.add_value('title', title)
            yield scrapy.FormRequest(url=self.BASE_URL + id_, meta={'song_id': id_[9:], 'loader': l}, method='GET',
                                     headers=self.headers, callback=self.parse_single_song)

Nyn

Ответы похожие на “Пример загрузчика скраски”

Вопросы похожие на “Пример загрузчика скраски”

Смотреть популярные ответы по языку

Смотреть другие языки программирования

Shell/Bash

C++

CSS

HTML

Java

JavaScript

Objective-C

PHP

Python

Sql

Swift

Ruby

TypeScript

Kotlin

Assembly

VBA

Scala

Rust

Dart

Elixir

Clojure

Haskell

Matlab

Erlang

Cobol

Fortran

Scheme

Perl

Groovy

Lua

Julia

Delphi

Abap

Lisp

Prolog

Pascal

ActionScript

Basic

Solidity

PowerShell

GDScript

Excel

“Пример загрузчика скраски” Ответ

Пример загрузчика скраски

Пример загрузчика скраски

Пример загрузчика скраски

Пример загрузчика скраски

Пример загрузчика скраски

Пример загрузчика скраски

Пример загрузчика скраски

Пример загрузчика скраски

Пример загрузчика скраски

Пример загрузчика скраски

Ответы похожие на “Пример загрузчика скраски”

Вопросы похожие на “Пример загрузчика скраски”

Смотреть популярные ответы по языку

Shell/Bash

C#

C++

C

CSS

HTML

Java

JavaScript

Objective-C

PHP

Python

Sql

Swift

Ruby

TypeScript

Go

Kotlin

Assembly

R

VBA

Scala

Rust

Dart

Elixir

Clojure

Haskell

Matlab

Erlang

Cobol

Fortran

Scheme

Perl

Groovy

Lua

Julia

Delphi

Abap

Lisp

Prolog

Pascal

ActionScript

Basic

Solidity

PowerShell

GDScript

Excel

Смотреть другие языки программирования