import string
import re
import pprint
from calibre import strftime

__license__ = 'GPL v3'
__copyright__ = '2012-14, Bernd Leinfelder '

'''
webpaper.nzz.ch
'''

from calibre.ptempfile import PersistentTemporaryFile
from calibre.web.feeds.recipes import BasicNewsRecipe


class Nzz(BasicNewsRecipe):
    """Calibre recipe for the NZZ Webpaper (https://webpaper.nzz.ch).

    The recipe logs in with the user's credentials, downloads the issue
    page referenced from the landing page, and splits it into one local
    HTML file per article, grouped by the article's department.
    """

    title = 'NZZ Webpaper'
    __author__ = 'Bernd Leinfelder'
    description = 'Neue Zuercher Zeitung Webpaper - Erfordert NZZ Digital Abonnement. v20140425'
    timefmt = ' [%a, %d %b, %Y]'
    publisher = 'NZZ AG'
    needs_subscription = True
    category = 'news, politics, nachrichten, Switzerland'
    oldest_article = 2
    # cover_url = u'http://www.nzz.ch/version-2.34.1/bundles/website/i/png/nzzlogo-print.png'
    # NOTE(review): this points at a file on one specific machine and will
    # not resolve anywhere else -- replace with a reachable cover image.
    cover_url = u'file:///home/russ/Dropbox/library/nzz/nzzlogo-print.png'
    max_articles_per_feed = 100
    no_stylesheets = True
    encoding = 'utf-8'
    use_embedded_content = False
    language = 'de'
    # Handles of the per-article temp files; references are kept here so the
    # files created in parse_index() stay reachable after it returns.
    temp_files = []

    extra_css = 'h1 {font: sans-serif large;}\n.byline {font:monospace;}'

    conversion_options = {
        'comments': description,
        'tags': category,
        'language': language,
        'publisher': publisher,
    }

    remove_tags = [
        dict(name='footer'),
        dict({'class': ['sharebox', 'fullarticle__related']}),
    ]

    remove_tags_before = dict(name='article')
    remove_tags_after = dict(name='footer')

    def parse_index(self):
        """Build the section/article index for the current issue.

        Loads the landing page, follows its ``<link rel="prefetch">`` to the
        issue page, and writes every matching ``<article>`` element to its
        own temporary HTML file.

        Returns:
            A list of ``(section, articles)`` tuples in order of first
            appearance, where ``articles`` is a list of dicts with the keys
            ``title``, ``url``, ``date``, ``description`` and ``content``.

        Raises:
            ValueError: if the landing page has no usable prefetch link.
        """
        baseref = 'https://webpaper.nzz.ch'
        soup = self.index_to_soup(baseref)

        issue = soup.find("link", rel="prefetch")
        if issue is None or not issue.get('href'):
            raise ValueError(
                'No <link rel="prefetch"> found on ' + baseref +
                '; cannot locate the current issue')
        soup = self.index_to_soup(baseref + issue['href'])

        articles = {}
        sections = []
        pubdate = strftime('%a, %d %b')

        article_tags = soup.findAll(
            "article", {"class": re.compile(r'.*fullarticle[ "].*')})
        for art in article_tags:
            section = art['data-department']
            # Key on membership rather than "differs from the previous
            # section": a department that shows up again later must extend
            # its existing list, not reset it or be listed twice.
            if section not in articles:
                sections.append(section)
                articles[section] = []

            # The heading may be missing or contain nested markup, in which
            # case .string would fail or be None; extract the text instead.
            caption = art.find("h2")
            headline = ''
            if caption is not None:
                headline = self.tag_to_string(caption).strip()
            if not headline:
                headline = 'Untitled'

            # The temp file is used with its default mode here, which is
            # binary in calibre, so text markup is encoded first.
            markup = art.prettify()
            if not isinstance(markup, bytes):
                markup = markup.encode('utf-8')

            tmp = PersistentTemporaryFile('_fa.html')
            self.temp_files.append(tmp)
            try:
                tmp.write(markup)
            finally:
                tmp.close()

            articles[section].append(dict(
                title=headline,
                url='file://' + tmp.name,
                date=pubdate,
                description='',
                content=''))

        return [(name, articles[name]) for name in sections]

    def get_browser(self):
        """Return a browser, logged in to webpaper.nzz.ch when possible.

        If both ``self.username`` and ``self.password`` are set, the first
        form on the login page is filled in and submitted; otherwise the
        plain browser from the base class is returned.
        """
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open('https://webpaper.nzz.ch/login')
            br.select_form(nr=0)
            br['username'] = self.username
            br['password'] = self.password
            br.submit()
        return br