Source code for soupstars.examples.nytimes

"""
NYTimes
~~~~~~~

Extract article links and article metadata from nytimes.com

"""

import re
import sys

from soupstars import Parser, serialize


if sys.version_info.major >= 3:
    import urllib
    urlparse = urllib.parse.urlparse
else:
    import urlparse
    urlparse = urlparse.urlparse


[docs]class NytimesArticleParser(Parser):
    """
    Parse attributes from a NY times article.
    
    >>> from soupstars.examples.nytimes import NytimesArticleParser

    """

[docs]    @serialize
    def title(self):
        """
        The title of the article.
        """

        return self.h1.text

[docs]    @serialize
    def author(self):
        """
        The author(s) of the article.
        """

        return self.find(attrs={'itemprop': 'author creator'}).text
        

[docs]class NytimesLinkParser(Parser):
    """
    Parse the links from a NY times webpage.

    :param str url: The webpage to parse

    >>> from soupstars.examples.nytimes import NytimesLinkParser
    """

    host = "www.nytimes.com"
    article_link_regex = re.compile(r'^/\d{4}/\d{2}/\d{2}/')

    def valid_nytimes_link(self, parsed_url):
        return parsed_url.netloc == self.host

    def valid_article_link(self, parsed_url):
        return re.match(self.article_link_regex, parsed_url.path) is not None

    def format_nytimes_link(self, parsed_url):
        return "{scheme}://{netloc}{path}".format(scheme=parsed_url.scheme,
                                                  netloc=parsed_url.netloc,
                                                  path=parsed_url.path)

    def nytimes_links(self):
        result = set()
        for tag in self.find_all('a'):
            url = tag.get('href', '')
            parsed_url = urlparse(url)
            if self.valid_nytimes_link(parsed_url):
                result.add(url)
        else:
            return list(result)
    
[docs]    @serialize
    def article_links(self):
        """
        A list of links that point to NYTimes articles
        """

        result = set()
        for url in self.nytimes_links():
            parsed_url = urlparse(url)
            if self.valid_article_link(parsed_url):
                result.add(self.format_nytimes_link(parsed_url))
        else:
            return result
            
[docs]    @serialize
    def non_article_links(self):
        """
        A list of links that points to NYTimes pages that are not articles.
        """
        
        result = set()
        for url in self.nytimes_links():
            parsed_url = urlparse(url)
            if not self.valid_article_link(parsed_url):
                result.add(self.format_nytimes_link(parsed_url))
        else:
            return result
Source code for soupstars.examples.nytimes

soupstars

Navigation

Related Topics