mirror of
https://github.com/wallabag/wallabag.git
synced 2024-12-18 21:56:29 +00:00
[add] new specific configuration files
This commit is contained in:
parent
d5501950e2
commit
ac4d114214
773 changed files with 6982 additions and 0 deletions
6
inc/3rdparty/site_config/standard/37signals.com.txt
vendored
Normal file
6
inc/3rdparty/site_config/standard/37signals.com.txt
vendored
Normal file
|
@ -0,0 +1,6 @@
|
|||
title: //div[@class='post_header']//h2/a
|
||||
author: //span[@class='author']
|
||||
date: //span[@class='date']
|
||||
body: //div[@id='Content']
|
||||
|
||||
test_url: http://37signals.com/svn/posts/2785-the-end-of-the-it-department
|
9
inc/3rdparty/site_config/standard/3quarksdaily.com.txt
vendored
Normal file
9
inc/3rdparty/site_config/standard/3quarksdaily.com.txt
vendored
Normal file
|
@ -0,0 +1,9 @@
|
|||
body: //div[@class='content']
|
||||
date: //div[@class='content']/h2
|
||||
strip: //div[@class='content']/h2
|
||||
title: //div[@class='content']/h3
|
||||
|
||||
strip: //div[@id='postmenu']
|
||||
strip: //div[@class='trackback']
|
||||
tidy: no
|
||||
test_url: http://www.3quarksdaily.com/3quarksdaily/2012/01/martin-luther-king-i-have-a-dream.html
|
11
inc/3rdparty/site_config/standard/3voor12.vpro.nl.txt
vendored
Normal file
11
inc/3rdparty/site_config/standard/3voor12.vpro.nl.txt
vendored
Normal file
|
@ -0,0 +1,11 @@
|
|||
body: //div[@id='main']
|
||||
title: //div[@class='intro']/h1
|
||||
author: //ul[@class='text-data']/li[@class='author']
|
||||
date: //ul[@class='text-data']/li[@class='date']
|
||||
convert_double_br_tags: yes
|
||||
tidy: no
|
||||
|
||||
strip: //div[@class='share']
|
||||
strip: //*[@class='zoom']
|
||||
strip: //div[@id='disqus_thread']
|
||||
test_url: http://3voor12.vpro.nl/nieuws/2012/januari/Ook-website-GroenLinks-woensdag-op-zwart-i-v-m--SOPA.html
|
4
inc/3rdparty/site_config/standard/43folders.com.txt
vendored
Normal file
4
inc/3rdparty/site_config/standard/43folders.com.txt
vendored
Normal file
|
@ -0,0 +1,4 @@
|
|||
body: //*[@class = 'content']
|
||||
author: //*[@class = 'submitted']/a
|
||||
date: substring-after(//*[@class = 'submitted']/text(), '|')
|
||||
test_url: http://www.43folders.com/2011/04/22/cranking
|
27
inc/3rdparty/site_config/standard/500px.com.txt
vendored
Normal file
27
inc/3rdparty/site_config/standard/500px.com.txt
vendored
Normal file
|
@ -0,0 +1,27 @@
|
|||
# very loose setup for both 500px.com/photo/* and 500px.com/blog/*
|
||||
# photo page example: http://500px.com/photo/4181666
|
||||
# blog page example: http://500px.com/blog/110
|
||||
|
||||
# avoid "no text" error
|
||||
tidy:no
|
||||
prune:no
|
||||
|
||||
# reorganize photo page elements
|
||||
#body://div[contains(@class,'container')]
|
||||
move_into(body)://div[contains(@id,'thephoto')]
|
||||
move_into(body)://div[contains(@id,'description')]
|
||||
move_into(body)://div[contains(@id,'tags')]
|
||||
move_into(body)://div[contains(@id,'photo-info')]
|
||||
|
||||
# clean photo page info
|
||||
strip://span[contains(@id,'copyright')]
|
||||
strip://*[contains(@id,'store')]
|
||||
strip://*[contains(@id,'user-info')]
|
||||
strip://*[contains(@id,'photo-stats')]
|
||||
strip://*[contains(@id,'voting_controls_container')]
|
||||
strip://*[contains(@id,'more-photos')]
|
||||
strip://*[contains(@id,'embed-photo')]
|
||||
|
||||
# clean blog page side bar
|
||||
strip://*[contains(@class,'col d3 clearafter')]
|
||||
test_url: http://500px.com/photo/3641041?from=editors
|
2
inc/3rdparty/site_config/standard/512pixels.net.txt
vendored
Normal file
2
inc/3rdparty/site_config/standard/512pixels.net.txt
vendored
Normal file
|
@ -0,0 +1,2 @@
|
|||
title: substring-before(//title, '—')
|
||||
test_url: http://512pixels.net/more-on-linked-lists/
|
9
inc/3rdparty/site_config/standard/5by5.tv.txt
vendored
Normal file
9
inc/3rdparty/site_config/standard/5by5.tv.txt
vendored
Normal file
|
@ -0,0 +1,9 @@
|
|||
body: //*[@id="episode"]
|
||||
prune: no
|
||||
tidy: no
|
||||
|
||||
autodetect_next_page: no
|
||||
strip_id_or_class: player
|
||||
|
||||
strip://*[@id="header"]
|
||||
test_url: http://5by5.tv/buildanalyze/60
|
9
inc/3rdparty/site_config/standard/944.com.txt
vendored
Normal file
9
inc/3rdparty/site_config/standard/944.com.txt
vendored
Normal file
|
@ -0,0 +1,9 @@
|
|||
title: //h2[@class='border']
|
||||
body: //div[@class='padding']
|
||||
|
||||
convert_double_br_tags: yes
|
||||
|
||||
strip: //div[@id='social_sharing']
|
||||
strip: //div[@class='socialLinks']
|
||||
|
||||
test_url: http://www.944.com/articles/mild-obsessions-frock-la-get-to-know-victoria-tik-s-haute-sustainable-fashion-line/
|
10
inc/3rdparty/site_config/standard/aachener-nachrichten.de.txt
vendored
Normal file
10
inc/3rdparty/site_config/standard/aachener-nachrichten.de.txt
vendored
Normal file
|
@ -0,0 +1,10 @@
|
|||
title: //meta[@property='og:title']/@content
|
||||
body: //*[@class='fliesstext_detail' or @class='detail_fliesstext'] | //img[@itemprop="image" and starts-with(@src, "/sixcms/media.php/")]
|
||||
|
||||
strip_id_or_class: socialshareprivacy1
|
||||
strip_id_or_class: zvaFacebookButton
|
||||
|
||||
tidy: no
|
||||
prune: no
|
||||
|
||||
test_url: http://www.aachener-nachrichten.de/lokales/aachen-detail-an/2517757
|
10
inc/3rdparty/site_config/standard/aachener-zeitung.de.txt
vendored
Normal file
10
inc/3rdparty/site_config/standard/aachener-zeitung.de.txt
vendored
Normal file
|
@ -0,0 +1,10 @@
|
|||
title: //meta[@property='og:title']/@content
|
||||
body: //*[@class='fliesstext_detail' or @class='detail_fliesstext'] | //img[@itemprop="image" and starts-with(@src, "/sixcms/media.php/")]
|
||||
|
||||
strip_id_or_class: socialshareprivacy1
|
||||
strip_id_or_class: zvaFacebookButton
|
||||
|
||||
tidy: no
|
||||
prune: no
|
||||
|
||||
test_url: http://www.aachener-zeitung.de/sixcms/detail.php?template=az_detail&id=2552718
|
7
inc/3rdparty/site_config/standard/abc.es.txt
vendored
Normal file
7
inc/3rdparty/site_config/standard/abc.es.txt
vendored
Normal file
|
@ -0,0 +1,7 @@
|
|||
title: //meta[@property='og:title']/@content
|
||||
body: //div[@class='datosi' or @class='date' or @class='photo-alt1' or @class='text']
|
||||
strip_id_or_class: colB
|
||||
|
||||
prune: no
|
||||
|
||||
test_url: http://www.abc.es/20120209/tv-series/abci-house-ultima-temporada-201202090936.html
|
10
inc/3rdparty/site_config/standard/abc.net.au.txt
vendored
Normal file
10
inc/3rdparty/site_config/standard/abc.net.au.txt
vendored
Normal file
|
@ -0,0 +1,10 @@
|
|||
title: //h1
|
||||
author: //div[@class="byline"]/a
|
||||
date: //span[@class="timestamp"]
|
||||
|
||||
strip: //p[@class="topics"]
|
||||
strip: //h1
|
||||
strip: //div[@class="byline"]
|
||||
strip: //p[@class="published"]
|
||||
strip: //div[contains(@class,"featured-scroller")]
|
||||
test_url: http://www.abc.net.au/news/2011-11-08/crabb-carbon-legislation-abbott-demolition/3652544
|
27
inc/3rdparty/site_config/standard/abcnews.go.com.txt
vendored
Normal file
27
inc/3rdparty/site_config/standard/abcnews.go.com.txt
vendored
Normal file
|
@ -0,0 +1,27 @@
|
|||
title: //h1[@class='headline']
|
||||
body: //div[@id='storyText']
|
||||
# for video entries
|
||||
body: //img[@id='ff-img'] | //div[@id='meta']//div[contains(@class, 'overview')]
|
||||
author: //div[@class='byline']
|
||||
date: //div[@class='date']
|
||||
strip: //*[@id='date_partner']
|
||||
|
||||
strip: //div[@class='breadcrumb']
|
||||
strip: //div[contains(@class,'show_tools')]
|
||||
strip: //div[@id='sponsoredByAd']
|
||||
strip: //div[contains(@class,'rel_container')]
|
||||
strip: //p[a[starts-with(@href, 'http://www.twitter.com')]]
|
||||
strip: //p[a[starts-with(@href, 'http://www.facebook.com')]]
|
||||
strip: //p[contains(., 'Click here to return to')]
|
||||
#strip_id_or_class: media
|
||||
strip_id_or_class: mediaplayer
|
||||
|
||||
replace_string(<link rel="image_src" href="http): <img id="ff-img" src="http
|
||||
|
||||
prune: no
|
||||
|
||||
single_page_link: concat(//li[@class='pager']//a/@href, '&singlePage=true')
|
||||
|
||||
test_url: http://abcnews.go.com/Politics/newt-gingrich-rocky-rollout-presidential-campaign-recover/story?id=13632744
|
||||
# multi-page
|
||||
test_url: http://abcnews.go.com/Blotter/family-freed-american-hostage-somalia-seals-obama/story?id=15439544
|
9
inc/3rdparty/site_config/standard/accesstoinsight.org.txt
vendored
Normal file
9
inc/3rdparty/site_config/standard/accesstoinsight.org.txt
vendored
Normal file
|
@ -0,0 +1,9 @@
|
|||
title: //div[@id='H_docTitle']
|
||||
|
||||
body: //div[@id='H_meta' or @id='H_content' or @id='F_footer']
|
||||
|
||||
strip_id_or_class: F_toenail
|
||||
|
||||
prune: no
|
||||
|
||||
test_url: http://www.accesstoinsight.org/lib/authors/nyanaponika/wheel026.html
|
3
inc/3rdparty/site_config/standard/acidcow.com.txt
vendored
Normal file
3
inc/3rdparty/site_config/standard/acidcow.com.txt
vendored
Normal file
|
@ -0,0 +1,3 @@
|
|||
body: //div[starts-with(@id, 'news-id-')]
|
||||
|
||||
test_url: http://acidcow.com/fun/20933-acid-picdump-83-pics.html
|
9
inc/3rdparty/site_config/standard/acquia.com.txt
vendored
Normal file
9
inc/3rdparty/site_config/standard/acquia.com.txt
vendored
Normal file
|
@ -0,0 +1,9 @@
|
|||
title://h1[@class="title"]
|
||||
author://div[@class="submitted"]/span/a
|
||||
date://div[@class="submitted"]/span
|
||||
body://div[@class="content-wrapper"]
|
||||
|
||||
strip://div[@id="skip-link"]
|
||||
strip://div[@id="region-content-3-3"]
|
||||
strip://div[@id="section-footer"]
|
||||
test_url: https://www.acquia.com/blog/drupals-long-warmth-toward-third-party-code
|
5
inc/3rdparty/site_config/standard/acroswing.fr.txt
vendored
Normal file
5
inc/3rdparty/site_config/standard/acroswing.fr.txt
vendored
Normal file
|
@ -0,0 +1,5 @@
|
|||
tidy:no
|
||||
date: //time[@class='updated']
|
||||
dissolve: //ul[@class='video-gallery']/li
|
||||
dissolve: //ul[@class='video-gallery']
|
||||
test_url: http://www.acroswing.fr/actualites/competition_rock/selectif_bellegarde_sur_valserine__2012-02-26.php
|
15
inc/3rdparty/site_config/standard/aht.seriouseats.com.txt
vendored
Normal file
15
inc/3rdparty/site_config/standard/aht.seriouseats.com.txt
vendored
Normal file
|
@ -0,0 +1,15 @@
|
|||
body: //div[@id='content']
|
||||
|
||||
# clean up recipe pages
|
||||
strip: //h2[@class='fn'] | //h2[@class='double-lined'] | //h3 | //div[@id='threeColumn2'] | //div[@id='threeColumn3']
|
||||
|
||||
#recipe pages
|
||||
strip_id_or_class: "recipe-feedback"
|
||||
strip_id_or_class: "comments"
|
||||
strip_id_or_class: "procedure-number"
|
||||
strip_id_or_class: "more-with-author"
|
||||
|
||||
#slice
|
||||
strip_id_or_class: "inner"
|
||||
|
||||
test_url: http://aht.seriouseats.com/archives/2009/12/the-burger-lab-salting-ground-beef.html
|
2
inc/3rdparty/site_config/standard/alex.mullr.net.txt
vendored
Normal file
2
inc/3rdparty/site_config/standard/alex.mullr.net.txt
vendored
Normal file
|
@ -0,0 +1,2 @@
|
|||
body: //div[@class="entry"]
|
||||
test_url: http://alex.mullr.net/blog/2011/05/on-spotify/
|
12
inc/3rdparty/site_config/standard/alistapart.com.txt
vendored
Normal file
12
inc/3rdparty/site_config/standard/alistapart.com.txt
vendored
Normal file
|
@ -0,0 +1,12 @@
|
|||
title: //h1[@class='title']
|
||||
author: //h3[@class='byline']/a
|
||||
date: //div[@class='ishinfo']
|
||||
|
||||
body: //*[@id='articletext']
|
||||
strip_id_or_class: 'ishinfo'
|
||||
strip_id_or_class: 'metastuff'
|
||||
strip_id_or_class: 'learnmore'
|
||||
strip_id_or_class: 'discuss'
|
||||
|
||||
prune: no
|
||||
test_url: http://www.alistapart.com/articles/organizing-mobile/
|
8
inc/3rdparty/site_config/standard/aljazeera.com.txt
vendored
Normal file
8
inc/3rdparty/site_config/standard/aljazeera.com.txt
vendored
Normal file
|
@ -0,0 +1,8 @@
|
|||
title: //span[@id='DetailedTitle']
|
||||
body: //td[@id='tdTextContent']
|
||||
strip_id_or_class: Skyscrapper_Body
|
||||
date: //span[@id='ctl00_cphBody_lblDate']
|
||||
author: //div[@id="dvAuthorInfo"]//a/text()
|
||||
strip: //table[ tbody/tr/td/object ]
|
||||
prune: no
|
||||
test_url: http://www.aljazeera.com/indepth/opinion/2012/01/2012114121925380575.html
|
14
inc/3rdparty/site_config/standard/allrecipes.com.txt
vendored
Normal file
14
inc/3rdparty/site_config/standard/allrecipes.com.txt
vendored
Normal file
|
@ -0,0 +1,14 @@
|
|||
title: //h1[@id='itemTitle']
|
||||
body: //img[@id="ctl00_CenterColumnPlaceHolder_recipe_photoStuff_imgPhoto"] | //div[@id='ctl00_CenterColumnPlaceHolder_recipe_divSubmitter'] | //div[contains(@class, 'recipe-details-content')]
|
||||
strip: //div[@class='top-left' or @class='top-right' or @class='bot-left' or @class='bot-right']
|
||||
strip: //div[contains(@class, 'rightcoltoolsdiv')]
|
||||
strip: //div[contains(@class, 'servings-form')]
|
||||
strip: //p[@class='nutritional-information']
|
||||
strip: //a[contains(@class, 'nutritional-information') or contains(@class, 'nutritionanchor')]
|
||||
strip: //div[@id='nutri-info']/div[contains(@class, 'title')]
|
||||
strip: //img[@id='ctl00_CenterColumnPlaceHolder_recipe_imgSubmitter']
|
||||
strip_id_or_class: eshaAttribute
|
||||
strip_id_or_class: eshaParagraph
|
||||
prune: no
|
||||
|
||||
test_url: http://allrecipes.com/Recipe/Taco-Pie/Detail.aspx?src=rotd
|
10
inc/3rdparty/site_config/standard/allthingsd.com.txt
vendored
Normal file
10
inc/3rdparty/site_config/standard/allthingsd.com.txt
vendored
Normal file
|
@ -0,0 +1,10 @@
|
|||
title://div[@class="article-title"]/h1[@class="title"]
|
||||
date: //p[@class="article-date"]
|
||||
body://*[@class="article-body article-text"]
|
||||
# Trim out related posts at bottom of article
|
||||
strip://blockquote[@class="memo"]
|
||||
|
||||
# Yup, no idea why author won't work...
|
||||
author://div[@class="page-header article-header clearfix"]/p[@class="title"]
|
||||
# [Marco:] Author won't work here because the page defines the "home" link under the author's name as rel="author", which always gets priority if the page has defined it.
|
||||
test_url: http://allthingsd.com/20120513/exclusive-yahoos-thompson-out-levinsohn-in-board-settlement-with-loeb-nears-completion/
|
8
inc/3rdparty/site_config/standard/allyou.com.txt
vendored
Normal file
8
inc/3rdparty/site_config/standard/allyou.com.txt
vendored
Normal file
|
@ -0,0 +1,8 @@
|
|||
title: //div[@id='pageHdr']//h1
|
||||
body: //div[@id='pageHdr']/*[@class='dek'] | //div[@id='printArticle' or @id='slideShowPrint']
|
||||
strip: //div[contains(@class, 'infoBox') or @id='infoBox']
|
||||
single_page_link: //li[@id='print']/a
|
||||
|
||||
prune: no
|
||||
|
||||
test_url: http://www.allyou.com/budget-home/money-shopping/freebies-online-00400000066392/
|
11
inc/3rdparty/site_config/standard/alphabeta.argaam.com.txt
vendored
Normal file
11
inc/3rdparty/site_config/standard/alphabeta.argaam.com.txt
vendored
Normal file
|
@ -0,0 +1,11 @@
|
|||
body: //div[@class = 'entry']
|
||||
date: substring-after(//p[@class="date"],'بتاريخ ')
|
||||
strip_id_or_class: date
|
||||
strip_id_or_class: follow-single
|
||||
strip_id_or_class: ratingblock
|
||||
strip_id_or_class: newRatingHolder
|
||||
strip_id_or_class: postmetadata
|
||||
strip_id_or_class: addthis_toolbox
|
||||
strip_id_or_class: addthis_default_style
|
||||
strip_id_or_class: size-full
|
||||
test_url: http://alphabeta.argaam.com/?p=35657
|
9
inc/3rdparty/site_config/standard/alriyadh.com.txt
vendored
Normal file
9
inc/3rdparty/site_config/standard/alriyadh.com.txt
vendored
Normal file
|
@ -0,0 +1,9 @@
|
|||
body: //div[@id = "article-view"]
|
||||
body: //div[contains(@class, 'article')]//div[contains(@class, 'photo_bg')]
|
||||
author: //p[@class = "author"]
|
||||
strip: //h1
|
||||
strip: //h2
|
||||
strip_id_or_class: author
|
||||
prune: no
|
||||
test_url: http://www.alriyadh.com/2011/10/10/article674357.html
|
||||
test_url: http://www.alriyadh.com/net/article/780935
|
2
inc/3rdparty/site_config/standard/alseraj.net.txt
vendored
Normal file
2
inc/3rdparty/site_config/standard/alseraj.net.txt
vendored
Normal file
|
@ -0,0 +1,2 @@
|
|||
title: //*[@id='normalfontyellow']
|
||||
test_url: http://www.alseraj.net/cgi-bin/pros/av/LeqaTextDisplay.cgi?display&2
|
2
inc/3rdparty/site_config/standard/alt1040.com.txt
vendored
Normal file
2
inc/3rdparty/site_config/standard/alt1040.com.txt
vendored
Normal file
|
@ -0,0 +1,2 @@
|
|||
body: //*[(@class = "historia")]
|
||||
test_url: http://alt1040.com/2011/09/banda-ancha-en-america-latina-insignificante
|
2
inc/3rdparty/site_config/standard/altfoto.com.txt
vendored
Normal file
2
inc/3rdparty/site_config/standard/altfoto.com.txt
vendored
Normal file
|
@ -0,0 +1,2 @@
|
|||
body: //*[(@class = "historia")]
|
||||
test_url: http://altfoto.com/2011/09/nikon-presenta-su-nuevo-sistema-nikon-1-y-dos-nuevas-camaras
|
10
inc/3rdparty/site_config/standard/alumni.stanford.edu.txt
vendored
Normal file
10
inc/3rdparty/site_config/standard/alumni.stanford.edu.txt
vendored
Normal file
|
@ -0,0 +1,10 @@
|
|||
title: //h1
|
||||
|
||||
author: substring-after(//div[@class="enableBullets"]/preceding-sibling::p[1], "By ")
|
||||
|
||||
date: //div/a[contains (@href, "issue")]
|
||||
|
||||
move_into(//div[@class="enableBullets"]/p): (//div[@id="content"]//img)[1]
|
||||
|
||||
body: //div[@class="enableBullets"]
|
||||
test_url: http://alumni.stanford.edu/get/page/magazine/article/?article_id=54819
|
19
inc/3rdparty/site_config/standard/amazon.com.txt
vendored
Normal file
19
inc/3rdparty/site_config/standard/amazon.com.txt
vendored
Normal file
|
@ -0,0 +1,19 @@
|
|||
title: //span[@id = 'btAsinTitle']
|
||||
body: (//*[@id='prodImageCell']//a)[1] | //div[@id = 'ps-content'] | //span[@id='actualPriceValue'] | //h2[.='Product Details']/following-sibling::div | //div[@class='h2' and .='Product Description']/following-sibling::div
|
||||
#strip_id_or_class: quantityDropdownDiv
|
||||
#strip_id_or_class: addToCartSpan
|
||||
#strip_id_or_class: oneClickDiv
|
||||
strip_id_or_class: nocontent
|
||||
strip_id_or_class: masDynamicConten
|
||||
strip_id_or_class: dynamic-content
|
||||
prune: no
|
||||
|
||||
find_string: <span id="actualPriceValue">
|
||||
replace_string: <span id="actualPriceValue"><br />Price:
|
||||
|
||||
strip_id_or_class: collapsePS
|
||||
strip_id_or_class: expandPS
|
||||
strip_id_or_class: psPlaceHolde
|
||||
strip: //li[contains(., 'update product info') or contains(., 'give feedback on images')]
|
||||
|
||||
test_url: http://www.amazon.com/Common-Sense-Forestry-Living-Mother/dp/1931498210/
|
6
inc/3rdparty/site_config/standard/americandrink.net.txt
vendored
Normal file
6
inc/3rdparty/site_config/standard/americandrink.net.txt
vendored
Normal file
|
@ -0,0 +1,6 @@
|
|||
title: //div[@class='head']/h2/a
|
||||
author: //div[@class='head']/a
|
||||
date: //div[@class='head']/p[@class='date']/a
|
||||
body: //div[@class='copy']
|
||||
strip: //p[@class='meta']
|
||||
test_url: http://americandrink.net/post/10567188712/free-the-hooch
|
10
inc/3rdparty/site_config/standard/americascup.com.txt
vendored
Normal file
10
inc/3rdparty/site_config/standard/americascup.com.txt
vendored
Normal file
|
@ -0,0 +1,10 @@
|
|||
title: //div[@class="editorial-content"]/h3
|
||||
body: //div[@class="hero-image" or @class="editorial-content"]
|
||||
|
||||
strip: //ul[@class="hero-caption"]
|
||||
strip_id_or_class: footer
|
||||
|
||||
prune: no
|
||||
tidy: no
|
||||
|
||||
test_url: http://www.americascup.com/en/Latest/News/2012/3/Coutts-and-Peyron-tell-transformative-tale-at-Global-Sports-Forum/
|
5
inc/3rdparty/site_config/standard/americastestkitchenfeed.com.txt
vendored
Normal file
5
inc/3rdparty/site_config/standard/americastestkitchenfeed.com.txt
vendored
Normal file
|
@ -0,0 +1,5 @@
|
|||
title: //h1[@class="post-title"]
|
||||
author: //span[@class="author"]/a
|
||||
date: //span[@class="date"]
|
||||
body: //div[@class="post-content main"]
|
||||
test_url: http://www.americastestkitchenfeed.com/gadgets-and-gear/2012/07/chill-out-with-tovolos-king-cube-silicone-ice-cube-tray/
|
11
inc/3rdparty/site_config/standard/anandtech.com.txt
vendored
Normal file
11
inc/3rdparty/site_config/standard/anandtech.com.txt
vendored
Normal file
|
@ -0,0 +1,11 @@
|
|||
author: //a[@class='b'][1]
|
||||
date: substring-after(substring-before(//div, 'Posted in'), ' on ')
|
||||
strip_image_src: /content/images/globals/
|
||||
strip: //h2[. = 'Page 1']/preceding::p
|
||||
strip: //h2
|
||||
|
||||
prune: no
|
||||
|
||||
single_page_link: concat('http://www.anandtech.com/print/', substring-after(//meta[@property='og:url']/@content, '/show/'))
|
||||
|
||||
test_url: http://www.anandtech.com/show/5812/eurocom-monster-10-clevos-little-monster/
|
9
inc/3rdparty/site_config/standard/andyrutledge.com.txt
vendored
Normal file
9
inc/3rdparty/site_config/standard/andyrutledge.com.txt
vendored
Normal file
|
@ -0,0 +1,9 @@
|
|||
title: //h2
|
||||
author: string('Andy Rutledge')
|
||||
date: //div[@class='articledate']
|
||||
body: //div[@class='copybody']
|
||||
|
||||
strip: //*[@class='space']
|
||||
strip: //*[@class='articleFoot']
|
||||
|
||||
test_url: http://www.andyrutledge.com/hungry-for-a-better-menu.php
|
9
inc/3rdparty/site_config/standard/annatravelling.wordpress.com.txt
vendored
Normal file
9
inc/3rdparty/site_config/standard/annatravelling.wordpress.com.txt
vendored
Normal file
|
@ -0,0 +1,9 @@
|
|||
title: //h1[@class="title"]
|
||||
|
||||
author: ("Anna Manasova")
|
||||
# is ignored, unfortunately
|
||||
|
||||
date: //p[@class="date"]
|
||||
|
||||
body: //div[@class="entry"]
|
||||
test_url: http://annatravelling.wordpress.com/2011/11/07/a-day-of-cooking-thai/
|
18
inc/3rdparty/site_config/standard/applature.com.txt
vendored
Normal file
18
inc/3rdparty/site_config/standard/applature.com.txt
vendored
Normal file
|
@ -0,0 +1,18 @@
|
|||
title: //h1[contains(@class, 'title')]
|
||||
body: //div[@id='mainContent']//div[contains(@class, 'section_content')] | //ul[@class='section_footer']
|
||||
date: //div[@class='date']
|
||||
|
||||
strip_id_or_class: sharethis
|
||||
strip_id_or_class: stats
|
||||
strip_id_or_class: apply_form
|
||||
strip_id_or_class: job_map
|
||||
strip_id_or_class: respond
|
||||
strip: //h1//span[@class='type']
|
||||
strip: //li[@class='print' or @class='map']
|
||||
|
||||
replace_string(<ul class="section_footer" style="display): <ul class="section_footer" style="display-bla
|
||||
|
||||
prune: no
|
||||
tidy: no
|
||||
|
||||
test_url: http://applature.com/mining-jobs/jobs/nickel-west-leinster-analytical-laboratory-technician/
|
7
inc/3rdparty/site_config/standard/apple.com.txt
vendored
Normal file
7
inc/3rdparty/site_config/standard/apple.com.txt
vendored
Normal file
|
@ -0,0 +1,7 @@
|
|||
strip: //p[@class='sosumi']
|
||||
# Aren't they witty?
|
||||
|
||||
# I can't work out what causes the stray character before the title.
|
||||
title: //h1[@class='title']
|
||||
strip: //h1[@class='title']
|
||||
test_url: http://www.apple.com/pr/library/2011/02/15appstore.html
|
11
inc/3rdparty/site_config/standard/appleinsider.com.txt
vendored
Normal file
11
inc/3rdparty/site_config/standard/appleinsider.com.txt
vendored
Normal file
|
@ -0,0 +1,11 @@
|
|||
title: //p[@class='title']
|
||||
|
||||
author: //p[text() = 'By ']/a/text()
|
||||
strip: //p[text() = 'By ']
|
||||
|
||||
body: //td[@class='bod']
|
||||
strip_id_or_class: title
|
||||
strip_id_or_class: minor
|
||||
|
||||
strip_id_or_class: multipagefooter
|
||||
test_url: http://www.appleinsider.com/articles/12/02/29/inside_os_x_108_mountain_lion_safari_52_gets_a_simplified_user_interface_with_new_sharing_features.html
|
2
inc/3rdparty/site_config/standard/appleweblog.com.txt
vendored
Normal file
2
inc/3rdparty/site_config/standard/appleweblog.com.txt
vendored
Normal file
|
@ -0,0 +1,2 @@
|
|||
body: //*[(@class = "historia")]
|
||||
test_url: http://appleweblog.com/2011/09/encontrada-vulnerabilidad-grave-en-skype-para-ios
|
5
inc/3rdparty/site_config/standard/archdaily.com.txt
vendored
Normal file
5
inc/3rdparty/site_config/standard/archdaily.com.txt
vendored
Normal file
|
@ -0,0 +1,5 @@
|
|||
date: //div[@class='post_date']
|
||||
|
||||
body: //div[@class='post_content']
|
||||
|
||||
test_url: http://www.archdaily.com/185325/p10-mixed-use-building-studio-up
|
18
inc/3rdparty/site_config/standard/archiveofourown.org.txt
vendored
Normal file
18
inc/3rdparty/site_config/standard/archiveofourown.org.txt
vendored
Normal file
|
@ -0,0 +1,18 @@
|
|||
# Description: Fix XPaths to include ALL chapters on 'view_full_work' pages.
|
||||
# Include: work meta, summary, chapter information, and notes which Instapaper strips out on default.
|
||||
# Exclude: header, footer, navigation, comments.
|
||||
# Notes: User is a newbie with XPaths.
|
||||
|
||||
title: //h2[@class='title']
|
||||
author: //h3[@class='byline']
|
||||
author: //a[@class='login author']
|
||||
|
||||
strip_id_or_class:header
|
||||
strip_id_or_class:navigation
|
||||
strip_id_or_class:feedback
|
||||
strip_id_or_class:kudos
|
||||
strip_id_or_class:add_comment_placeholder
|
||||
strip_id_or_class:add_comment
|
||||
strip_id_or_class:globalize
|
||||
strip_id_or_class:footer
|
||||
test_url: http://archiveofourown.org/works/229402?view_full_work=true
|
16
inc/3rdparty/site_config/standard/arstechnica.com.txt
vendored
Normal file
16
inc/3rdparty/site_config/standard/arstechnica.com.txt
vendored
Normal file
|
@ -0,0 +1,16 @@
|
|||
author: //p[@class='byline']/a
|
||||
body: //div[contains(@class,'article-content')]
|
||||
strip: //h2[@class='title']
|
||||
strip_id_or_class: byline
|
||||
prune: no
|
||||
|
||||
date: //div[@class='byline']/span[@class='posted']//abbr/@original-title
|
||||
date: //div[@class='byline']/span[@class='posted']//abbr
|
||||
|
||||
title: //div[@id='story']//h2[@class='title']
|
||||
|
||||
strip: //div[@class='pager']
|
||||
next_page_link: //nav//a[span/@class='next']/@href
|
||||
|
||||
test_url: http://arstechnica.com/tech-policy/news/2012/02/gigabit-internet-for-80-the-unlikely-success-of-californias-sonicnet.ars
|
||||
test_url: http://arstechnica.com/apple/2005/04/macosx-10-4/
|
6
inc/3rdparty/site_config/standard/articles.boston.com.txt
vendored
Normal file
6
inc/3rdparty/site_config/standard/articles.boston.com.txt
vendored
Normal file
|
@ -0,0 +1,6 @@
|
|||
title: //div[@class="mod-bostonarticleheader mod-articleheader"]/h1
|
||||
author: substring-after(//div[@class="mod-bostonarticlebyline mod-articlebyline"]/span[3],"By ")
|
||||
date: //div[@class="mod-bostonarticlebyline mod-articlebyline"]/span[@class="pubdate"]
|
||||
|
||||
strip_id_or_class: mod-pagination
|
||||
test_url: http://articles.boston.com/2011-10-23/news/30313691_1_bigfoot-free-speech-monadnock-state-park
|
11
inc/3rdparty/site_config/standard/articles.courant.com.txt
vendored
Normal file
11
inc/3rdparty/site_config/standard/articles.courant.com.txt
vendored
Normal file
|
@ -0,0 +1,11 @@
|
|||
title: //div[@class="mod-courantarticleheader mod-articleheader"]/h1
|
||||
date: //div[@class="mod-courantarticlebyline mod-articlebyline"]/span[@class="pubdate"]
|
||||
author: //div[@class="mod-courantarticlebyline mod-articlebyline"]/span[3]
|
||||
|
||||
strip_id_or_class: mod-article-byline
|
||||
strip_id_or_class: mod-article-header
|
||||
strip_id_or_class: mod-article-subtitle
|
||||
#This leaves some crud after the article, but it's better than nothing.
|
||||
#It would be ideal if we could set the body to every element matching //div[contains(@class, "mod-articletext")]/p, but it seems like body only takes the first matching element.
|
||||
|
||||
test_url: http://articles.courant.com/2011-10-22/news/hc-green-drugsearch--1022-20111022_1_drugs-in-student-lockers-police-dogs-lockdown
|
3
inc/3rdparty/site_config/standard/asahi.com.txt
vendored
Normal file
3
inc/3rdparty/site_config/standard/asahi.com.txt
vendored
Normal file
|
@ -0,0 +1,3 @@
|
|||
body: //div[@id='HeadLine']
|
||||
strip: //div[@id='utility_right']
|
||||
test_url: http://www.asahi.com/culture/update/0520/TKY201105200321.html
|
5
inc/3rdparty/site_config/standard/ascarter.net.txt
vendored
Normal file
5
inc/3rdparty/site_config/standard/ascarter.net.txt
vendored
Normal file
|
@ -0,0 +1,5 @@
|
|||
title: //h1[@class='article_title']
|
||||
author: //span[@class='author']
|
||||
date: //h2[@class='dateline']
|
||||
body: //div[@class='article_body']
|
||||
test_url: http://ascarter.net/2012/02/20/enough-is-enough.html
|
7
inc/3rdparty/site_config/standard/astronews.com.txt
vendored
Normal file
7
inc/3rdparty/site_config/standard/astronews.com.txt
vendored
Normal file
|
@ -0,0 +1,7 @@
|
|||
title: //span[@class='titel']
|
||||
author: //span[@class='metadaten_C']/a//span[@class='metadaten_C']
|
||||
date: substring-after(//span[@class='metadaten_C'],'astronews.com')
|
||||
strip: //span[@class='bu']
|
||||
strip_image_src: '/_images/'
|
||||
|
||||
test_url: http://www.astronews.com/news/artikel/2011/10/1110-021.shtml
|
8
inc/3rdparty/site_config/standard/asymco.com.txt
vendored
Normal file
8
inc/3rdparty/site_config/standard/asymco.com.txt
vendored
Normal file
|
@ -0,0 +1,8 @@
|
|||
# Johannes Stühler
|
||||
|
||||
title://h2
|
||||
author://span[@class='meta-content']
|
||||
date://abbr[@class='date published']/@title
|
||||
body://div[@class='entry-content']
|
||||
|
||||
test_url: http://www.asymco.com/2011/01/14/is-android-more-efficient-than-ios-at-generating-search-revenue/
|
6
inc/3rdparty/site_config/standard/autoblog.com.txt
vendored
Normal file
6
inc/3rdparty/site_config/standard/autoblog.com.txt
vendored
Normal file
|
@ -0,0 +1,6 @@
|
|||
prune: no
|
||||
body: //div[@class='post-body']
|
||||
author: //p[@class='byline']//a
|
||||
date: substring-after(//div[@class='about']/p[2], 'Posted')
|
||||
strip: //div[@class='body']/div[@class='meta']
|
||||
test_url: http://www.autoblog.com/2012/01/17/next-gen-bmw-x5-caught-again/
|
4
inc/3rdparty/site_config/standard/avclub.com.txt
vendored
Normal file
4
inc/3rdparty/site_config/standard/avclub.com.txt
vendored
Normal file
|
@ -0,0 +1,4 @@
|
|||
author: //*[@id="article_wrapper"]/div[1]/a[1]
|
||||
body: //*[@id="article_wrapper"]/div[2]
|
||||
date: //*[@id="article_wrapper"]/div[1]/text()[2]
|
||||
test_url: http://www.avclub.com/articles/forgetmenot,70904
|
12
inc/3rdparty/site_config/standard/baltimoresun.com.txt
vendored
Normal file
12
inc/3rdparty/site_config/standard/baltimoresun.com.txt
vendored
Normal file
|
@ -0,0 +1,12 @@
|
|||
single_page_link: //div[@class='toppaginate']//a[@rel='nofollow']
|
||||
convert_double_br_tags: yes
|
||||
|
||||
title: //div[@class="story"]/h1
|
||||
body: //div[@id="story-body-text"]
|
||||
author: //span[@class="byline"]
|
||||
date: //p[@class="date"]
|
||||
|
||||
strip: //*[@class='all']
|
||||
strip: //*[@class='articlerail']
|
||||
|
||||
test_url: http://www.baltimoresun.com/news/maryland/bs-md-omalley-budget-2-20120116,0,5340585.story
|
7
inc/3rdparty/site_config/standard/basicthinking.de.txt
vendored
Normal file
7
inc/3rdparty/site_config/standard/basicthinking.de.txt
vendored
Normal file
|
@ -0,0 +1,7 @@
|
|||
title: //h2
|
||||
date: //span[@class='date']
|
||||
body: //div[@class='entry']
|
||||
|
||||
strip: //div[@class='zusatz']
|
||||
|
||||
test_url: http://www.basicthinking.de/blog/2011/12/13/sagt-social-networks-adieu-begrust-private-networks/
|
13
inc/3rdparty/site_config/standard/bb.is.txt
vendored
Normal file
13
inc/3rdparty/site_config/standard/bb.is.txt
vendored
Normal file
|
@ -0,0 +1,13 @@
|
|||
author: substring(//h3[@class='headlines']/span[@class='dates'],0,string-length(//h3[@class='headlines']/span[@class='dates'])-20)
|
||||
|
||||
|
||||
date: substring((//h3[@class='headlines']/span[@class='dates']),string-length(//h3[@class='headlines']/span[@class='dates'])-18,12)
|
||||
|
||||
|
||||
body: //div[@class='first-article-big']
|
||||
strip: //table[@class='newsimagecontainer']
|
||||
strip: //h3[@class='headlines']
|
||||
strip: //iframe[@class='headlines']
|
||||
strip: //a[@class='newslink']
|
||||
convert_double_br_tags: yes
|
||||
test_url: http://bb.is/Pages/82?NewsID=174119
|
32
inc/3rdparty/site_config/standard/bbc.co.uk.txt
vendored
Normal file
32
inc/3rdparty/site_config/standard/bbc.co.uk.txt
vendored
Normal file
|
@ -0,0 +1,32 @@
|
|||
body: //div[@class="story-body"]
|
||||
title: //h1[@class="story-header"]
|
||||
date: //span[@class="story-date"]/span[@class='date']
|
||||
|
||||
# recipes, e.g. http://www.bbc.co.uk/food/recipes/mymincepies_71055
|
||||
body: //div[contains(@class, 'hrecipe')]//div[@id='subcolumn-1']
|
||||
|
||||
#strip: //div[@class="story-feature narrow"]
|
||||
#strip: //div[@class="story-feature wide"]
|
||||
#strip: //div[@class="story-feature dslideshow-enclosure"]
|
||||
strip: //div[contains(@class, "story-feature")]
|
||||
strip: //span[@class="story-date"]
|
||||
#strip: //div[@class="caption body-narrow-width"]
|
||||
strip: //div[@class="warning"]//p
|
||||
strip: //div[@id='page-bookmark-links-head']
|
||||
strip: //object
|
||||
strip: //div[contains(@class, "bbccom_advert_placeholder")]
|
||||
strip: //div[contains(@class, "embedded-hyper")]
|
||||
strip: //div[contains(@class, 'market-data')]
|
||||
strip: //a[contains(@class, 'hidden')]
|
||||
strip: //div[contains(@class, 'hypertabs')]
|
||||
strip: //div[contains(@class, 'related')]
|
||||
strip: //form[@id='comment-form']
|
||||
strip: //div[contains(@class, 'comment-introduction')]
|
||||
|
||||
replace_string(<noscript>): <div>
|
||||
replace_string(</noscript>): </div>
|
||||
|
||||
prune: no
|
||||
|
||||
dissolve: //h2
|
||||
test_url: http://www.bbc.co.uk/news/business-15060862
|
16
inc/3rdparty/site_config/standard/benoitmaison.org.txt
vendored
Normal file
16
inc/3rdparty/site_config/standard/benoitmaison.org.txt
vendored
Normal file
|
@ -0,0 +1,16 @@
|
|||
body: //div[@class="entry-content"]
|
||||
|
||||
# Remove text ‘Tweet’
|
||||
strip: //div[@class="entry-content"]/div[last()]
|
||||
|
||||
title: //h1[@class="entry-title"]
|
||||
|
||||
# If the Instapaper text parser worked with HTML5 tags, we would use:
|
||||
date: //time[@class="entry-date"]
|
||||
|
||||
# But since it does not, use this more complicated rule:
|
||||
date: //div[@class="entry-meta"]/a[@rel="bookmark"]
|
||||
|
||||
# Unfortunately, the following rule is overridden by the automatically found author.
|
||||
author: ("Benoit Maison")
|
||||
test_url: http://www.benoitmaison.org/2011/12/06/why-siri-had-to-start-in-beta/
|
3
inc/3rdparty/site_config/standard/berlingske.dk.txt
vendored
Normal file
3
inc/3rdparty/site_config/standard/berlingske.dk.txt
vendored
Normal file
|
@ -0,0 +1,3 @@
|
|||
title: //h1[@class='headline']
|
||||
body: //div[contains(@class, 'article-wrapper')]
|
||||
test_url: http://www.berlingske.dk/danmark/festen-er-flyttet-nordpaa
|
2
inc/3rdparty/site_config/standard/betabeat.com.txt
vendored
Normal file
2
inc/3rdparty/site_config/standard/betabeat.com.txt
vendored
Normal file
|
@ -0,0 +1,2 @@
|
|||
body: //div[@class="entry-content"]
|
||||
test_url: http://www.betabeat.com/2011/07/04/sheryl-sandberg-breaks-through-silicon-valleys-boys-club-sort-of/
|
7
inc/3rdparty/site_config/standard/betanews.com.txt
vendored
Normal file
7
inc/3rdparty/site_config/standard/betanews.com.txt
vendored
Normal file
|
@ -0,0 +1,7 @@
|
|||
# some articles at this site, like this one, don't
|
||||
# seem to pick up the article body via normal
|
||||
# processing, other articles come through fine
|
||||
# http://www.betanews.com/joewilcox/article
|
||||
# /Google-is-a-marketing-sensation/1309708375
|
||||
body: //*[@id="article"]
|
||||
test_url: http://www.betanews.com/joewilcox/article/Google-is-a-marketing-sensation/1309708375
|
8
inc/3rdparty/site_config/standard/biography.com.txt
vendored
Normal file
8
inc/3rdparty/site_config/standard/biography.com.txt
vendored
Normal file
|
@ -0,0 +1,8 @@
|
|||
title: //div[contains(@class, 'main-content')]//h1
|
||||
body: //div[@class='summary-column'] | //div[contains(@class, 'main-content')]
|
||||
|
||||
prune: no
|
||||
|
||||
single_page_link: //div[@id='biography-action-links']//a[contains(@href, '/print/')]
|
||||
|
||||
test_url: http://www.biography.com/print/profile/martin-luther-9389283
|
2
inc/3rdparty/site_config/standard/bitelia.com.txt
vendored
Normal file
2
inc/3rdparty/site_config/standard/bitelia.com.txt
vendored
Normal file
|
@ -0,0 +1,2 @@
|
|||
body: //*[(@class = "historia")]
|
||||
test_url: http://bitelia.com/2011/09/klout-midiendo-influencia
|
7
inc/3rdparty/site_config/standard/bjango.com.txt
vendored
Normal file
7
inc/3rdparty/site_config/standard/bjango.com.txt
vendored
Normal file
|
@ -0,0 +1,7 @@
|
|||
title: //h1[@class='articlehead']
|
||||
body: //div[@class='column']
|
||||
strip: //h1
|
||||
strip: //div[@class='help']
|
||||
|
||||
#no author or date/time provided in current layout
|
||||
test_url: http://bjango.com/articles/actions/
|
8
inc/3rdparty/site_config/standard/blog.arsln.org.txt
vendored
Normal file
8
inc/3rdparty/site_config/standard/blog.arsln.org.txt
vendored
Normal file
|
@ -0,0 +1,8 @@
|
|||
tidy: no
|
||||
prune: no
|
||||
date: //article/header/h6/time
|
||||
title: //article/header/h3
|
||||
author: //meta[@name='author']/@content
|
||||
body: //article//post
|
||||
|
||||
test_url: http://blog.arsln.org/aska-ayip-oluyor/
|
7
inc/3rdparty/site_config/standard/blog.asmartbear.com.txt
vendored
Normal file
7
inc/3rdparty/site_config/standard/blog.asmartbear.com.txt
vendored
Normal file
|
@ -0,0 +1,7 @@
|
|||
title: //title
|
||||
author: //span[@class='author vcard']/a
|
||||
date: //p[@class='headline_meta']/abbr[@class='published']
|
||||
body: //div[@class='format_text entry-content']
|
||||
|
||||
strip: //div[@id='dd_ajax_float']
|
||||
test_url: http://blog.asmartbear.com/how-to-get-quality-freelance-graphics-design-work-on-a-budget.html
|
9
inc/3rdparty/site_config/standard/blog.cloudflare.com.txt
vendored
Normal file
9
inc/3rdparty/site_config/standard/blog.cloudflare.com.txt
vendored
Normal file
|
@ -0,0 +1,9 @@
|
|||
# Instapaper gets this back to front and only gets the blog title instead of the article title.
|
||||
title: substring-before(//title, '-')
|
||||
|
||||
author: //a[ contains(@href, '/people') ]
|
||||
|
||||
body: //div[ @class='post' ]
|
||||
|
||||
# Date is impossible to retrieve since they use those stupid "fuzzy" dates, inserted through javascript, at posterous.
|
||||
test_url: http://blog.cloudflare.com/understanding-analytics-when-is-a-page-view-n
|
5
inc/3rdparty/site_config/standard/blog.fefe.de.txt
vendored
Normal file
5
inc/3rdparty/site_config/standard/blog.fefe.de.txt
vendored
Normal file
|
@ -0,0 +1,5 @@
|
|||
title: //h2
|
||||
date: //h3
|
||||
body: //ul
|
||||
|
||||
test_url: http://blog.fefe.de/?ts=b063bf55
|
11
inc/3rdparty/site_config/standard/blog.instagram.com.txt
vendored
Normal file
11
inc/3rdparty/site_config/standard/blog.instagram.com.txt
vendored
Normal file
|
@ -0,0 +1,11 @@
|
|||
# clean Instagram blog a little bit
|
||||
|
||||
tidy:no
|
||||
prune:no
|
||||
|
||||
body://div[contains(@id,'content')]
|
||||
|
||||
strip_id_or_class:meta
|
||||
strip_id_or_class:notes
|
||||
strip_id_or_class:pagination
|
||||
test_url: http://blog.instagram.com/post/8757832007/fromwhereistand
|
4
inc/3rdparty/site_config/standard/blog.jaysalvat.com.txt
vendored
Normal file
4
inc/3rdparty/site_config/standard/blog.jaysalvat.com.txt
vendored
Normal file
|
@ -0,0 +1,4 @@
|
|||
date: //span[contains(@class, 'date-links')]
|
||||
author: //span[contains(@class, 'author-links')]
|
||||
body: //div[contains(@class, 'entry-content')]
|
||||
test_url: http://blog.jaysalvat.com/article/celui-qui-avait-refait-son-site-web
|
5
inc/3rdparty/site_config/standard/blog.kaelig.fr.txt
vendored
Normal file
5
inc/3rdparty/site_config/standard/blog.kaelig.fr.txt
vendored
Normal file
|
@ -0,0 +1,5 @@
|
|||
body: //*[contains(@class, 'post_content')]
|
||||
author: string('Kaelig Deloumeau-Prigent')
|
||||
title: //h1[@class='title']
|
||||
date: //span[@class='date']
|
||||
test_url: http://blog.kaelig.fr/post/24877648508/preprocesseurs-css-renoncer-par-choix-ou-par
|
6
inc/3rdparty/site_config/standard/blog.naver.com.txt
vendored
Normal file
6
inc/3rdparty/site_config/standard/blog.naver.com.txt
vendored
Normal file
|
@ -0,0 +1,6 @@
|
|||
title: //span[@class='pcol1 itemSubjectBoldfont']
|
||||
body: //div[@id='postListBody']
|
||||
date: //p[@class='date fil5 pcol2']
|
||||
single_page_link: /html/frameset/frame[1]/attribute::src
|
||||
strip: //div[@class='post-btn']
|
||||
test_url: http://blog.naver.com/how2invest/110135068757
|
12
inc/3rdparty/site_config/standard/blog.pchome.net.txt
vendored
Normal file
12
inc/3rdparty/site_config/standard/blog.pchome.net.txt
vendored
Normal file
|
@ -0,0 +1,12 @@
|
|||
# PCHOME blog, a popular Chinese blog host
|
||||
# Oct 15, 2011
|
||||
#
|
||||
|
||||
title://*[contains(@class,'imp')]/h2
|
||||
|
||||
date://*[contains(@class,'imp')]/span
|
||||
body://div[contains(@id,'blog_content')]
|
||||
|
||||
|
||||
|
||||
test_url: http://blog.pchome.net/article/462502.html
|
6
inc/3rdparty/site_config/standard/blog.pinboard.in.txt
vendored
Normal file
6
inc/3rdparty/site_config/standard/blog.pinboard.in.txt
vendored
Normal file
|
@ -0,0 +1,6 @@
|
|||
title: //a[@class="blog_title"]
|
||||
date: //p[@class="when"]/a
|
||||
body: //div[@class="blog_entry"]
|
||||
strip_id_or_class:blog_title
|
||||
strip_id_or_class:when
|
||||
test_url: http://blog.pinboard.in/2011/11/the_social_graph_is_neither/
|
26
inc/3rdparty/site_config/standard/blog.sina.com.cn.txt
vendored
Normal file
26
inc/3rdparty/site_config/standard/blog.sina.com.cn.txt
vendored
Normal file
|
@ -0,0 +1,26 @@
|
|||
# Sina blog, the most popular blog host in China.
|
||||
# Its source code is horrible.
|
||||
#
|
||||
# Issue:
|
||||
# Only the first image in the article is displayed.
|
||||
# The remaining images are replaced with a 1x1 transparent gif by the sina blog host.
|
||||
#
|
||||
|
||||
title://*[contains(@class,'titName SG_txta')]
|
||||
author://*[contains(@id,'ownernick')]
|
||||
date://*[contains(@class,'time SG_txtc')]
|
||||
body://div[contains(@class,'articalContent')]
|
||||
|
||||
# Remove redundant spans whose class starts with "MASS"
|
||||
# Example <span class="MASSf21674ffeef7"></span>
|
||||
strip://span[contains(@class,'MASS')]
|
||||
|
||||
# Remove comment
|
||||
strip://div[contains(@class,'allComm')]
|
||||
|
||||
# Remove hidden text and links
|
||||
strip://ins
|
||||
|
||||
tidy:no
|
||||
convert_double_br_tags:yes
|
||||
test_url: http://blog.sina.com.cn/s/blog_5054769e0102dtja.html
|
2
inc/3rdparty/site_config/standard/blog.spu.edu.txt
vendored
Normal file
2
inc/3rdparty/site_config/standard/blog.spu.edu.txt
vendored
Normal file
|
@ -0,0 +1,2 @@
|
|||
body://div[@class='post']
|
||||
test_url: http://blog.spu.edu/lectio/from-the-frying-pan-into-the-fire/
|
6
inc/3rdparty/site_config/standard/blog.wells.ee.txt
vendored
Normal file
6
inc/3rdparty/site_config/standard/blog.wells.ee.txt
vendored
Normal file
|
@ -0,0 +1,6 @@
|
|||
title: //h2/a[@class="no-link title"]
|
||||
author: //h2[@id="blog_owner"]
|
||||
date: //time
|
||||
strip: //h2/a[@class="no-link title"]
|
||||
test_url: http://blog.wells.ee/retina
|
||||
test_url: http://blog.wells.ee/skeuomorphism
|
8
inc/3rdparty/site_config/standard/blogs.aljazeera.net.txt
vendored
Normal file
8
inc/3rdparty/site_config/standard/blogs.aljazeera.net.txt
vendored
Normal file
|
@ -0,0 +1,8 @@
|
|||
# 2011-08-23 [carlo@...] Initial version.
|
||||
|
||||
author: //div[@id="blogauthordatebox-node"]//a[@title="View user profile."]/text()
|
||||
|
||||
# why yes, I do feel a bit dirty
|
||||
date: substring-before( substring-after( substring-after( //div[@id="blogauthordatebox-node"]//td[3], "on " ), ", "), " " )
|
||||
|
||||
test_url: http://blogs.aljazeera.net/asia/2011/08/22/peoples-hero
|
2
inc/3rdparty/site_config/standard/blogs.forbes.com.txt
vendored
Normal file
2
inc/3rdparty/site_config/standard/blogs.forbes.com.txt
vendored
Normal file
|
@ -0,0 +1,2 @@
|
|||
body: //div[@class='entry']
|
||||
test_url: http://blogs.forbes.com/adamhartung/2011/04/08/apple-is-better-managed-than-microsoft/
|
4
inc/3rdparty/site_config/standard/blogs.hbr.org.txt
vendored
Normal file
4
inc/3rdparty/site_config/standard/blogs.hbr.org.txt
vendored
Normal file
|
@ -0,0 +1,4 @@
|
|||
title: //div[@id='pageFeature']/h1
|
||||
body: //div[@id='articleBody']
|
||||
strip: //div[@class='module wide']
|
||||
test_url: http://blogs.hbr.org/bregman/2011/04/the-1-killer-of-meetings-and-w.html?utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+harvardbusiness+%28HBR.org%29
|
6
inc/3rdparty/site_config/standard/blogs.msdn.com.txt
vendored
Normal file
6
inc/3rdparty/site_config/standard/blogs.msdn.com.txt
vendored
Normal file
|
@ -0,0 +1,6 @@
|
|||
title: //h3[@class="post-name"]
|
||||
author: //span[@class="user-name"]
|
||||
date: //div[@class="post-date"]
|
||||
body: //div[@class="post-content user-defined-markup"]
|
||||
footnotes: no
|
||||
test_url: http://blogs.msdn.com/b/b8/archive/2011/10/04/designing-the-start-screen.aspx
|
3
inc/3rdparty/site_config/standard/blogs.reuters.com.txt
vendored
Normal file
3
inc/3rdparty/site_config/standard/blogs.reuters.com.txt
vendored
Normal file
|
@ -0,0 +1,3 @@
|
|||
title: //div[@id='single']/h1
|
||||
body: //div[@id='postcontent']
|
||||
test_url: http://blogs.reuters.com/felix-salmon/2010/07/16/the-value-of-a-strong-brand-apple-edition/
|
16
inc/3rdparty/site_config/standard/blogs.scientificamerican.com.txt
vendored
Normal file
16
inc/3rdparty/site_config/standard/blogs.scientificamerican.com.txt
vendored
Normal file
|
@ -0,0 +1,16 @@
|
|||
# meta data
|
||||
title://h1[@class = 'postTitle']
|
||||
author:substring-before(substring-after(//span[@class = 'byline'],'By '),'|')
|
||||
date://span[@class = 'datestamp']
|
||||
|
||||
#body content
|
||||
body://div[@id = 'singleBlogPost']
|
||||
|
||||
#reclaim author info
|
||||
move_into(//div[@id = 'singleBlogPost'])://div[@id = 'aboutAuthorDiv']
|
||||
strip://p[@class = 'moreLink mobileHide']
|
||||
|
||||
#cleanup comments, there might be some open <div> sections
|
||||
strip://div[@id = 'comments2']
|
||||
strip://h3[a[@href = '#add-comment']]
|
||||
test_url: http://blogs.scientificamerican.com/a-blog-around-the-clock/2012/07/10/science-blogs-definition-and-a-history/
|
15
inc/3rdparty/site_config/standard/blogs.smithsonianmag.com.txt
vendored
Normal file
15
inc/3rdparty/site_config/standard/blogs.smithsonianmag.com.txt
vendored
Normal file
|
@ -0,0 +1,15 @@
|
|||
# metadata
|
||||
author://div[@class = 'post']/div[@class='meta']/a[1]
|
||||
date://div[@id = 'rap']/h2[1]
|
||||
body://div[@class = 'post']
|
||||
|
||||
# wrapping caption and image
|
||||
wrap_in(fieldset)://div[contains(@class, 'wp-caption')]
|
||||
|
||||
|
||||
# clean up
|
||||
strip://div[@class = 'post']/h3[@class = 'storytitle']
|
||||
strip://div[@class = 'post']/div[@class = 'social']
|
||||
strip://img[@style = 'display:none;']
|
||||
strip://img[@height='0' and @width='0']
|
||||
test_url: http://blogs.smithsonianmag.com/adventure/2011/10/tips-for-women-traveling-in-turkey/
|
6
inc/3rdparty/site_config/standard/blogs.technet.com.txt
vendored
Normal file
6
inc/3rdparty/site_config/standard/blogs.technet.com.txt
vendored
Normal file
|
@ -0,0 +1,6 @@
|
|||
title: //h3[@class="post-name"]
|
||||
author: //span[@class="user-name"]
|
||||
date: //div[@class="post-date"]
|
||||
body: //div[@class="post-content user-defined-markup"]
|
||||
footnotes: no
|
||||
test_url: http://blogs.technet.com/b/dlemson/archive/2004/03/03/83304.aspx
|
4
inc/3rdparty/site_config/standard/bluetouff.com.txt
vendored
Normal file
4
inc/3rdparty/site_config/standard/bluetouff.com.txt
vendored
Normal file
|
@ -0,0 +1,4 @@
|
|||
body://div[@class='entry']
|
||||
date://div[@class='meta']
|
||||
strip://a[@class='FlattrButton']
|
||||
test_url: http://bluetouff.com/2012/03/02/polemique-google-vie-privee/
|
8
inc/3rdparty/site_config/standard/boagworld.com.txt
vendored
Normal file
8
inc/3rdparty/site_config/standard/boagworld.com.txt
vendored
Normal file
|
@ -0,0 +1,8 @@
|
|||
title: //h1[@class="entry-title"][2]
|
||||
author: string("Paul Boag")
|
||||
date: substring(//span[@class="meta"], 11)
|
||||
body: //article
|
||||
strip: //h2
|
||||
strip: //h1
|
||||
strip: //div[@id="callsToAction"]
|
||||
test_url: http://boagworld.com/working-in-web-design/dealing-with-the-dickheads/
|
11
inc/3rdparty/site_config/standard/boingboing.net.txt
vendored
Normal file
11
inc/3rdparty/site_config/standard/boingboing.net.txt
vendored
Normal file
|
@ -0,0 +1,11 @@
|
|||
# This is far from perfect, but so is BoingBoing's markup
|
||||
title: //h2[@class="headline"]
|
||||
single_page_link: //h2[@class="headline"]/a
|
||||
#date: //p[@class="byline"]
|
||||
body: //div[@class="post"]
|
||||
|
||||
strip_id_or_class: shareMe
|
||||
strip_id_or_class: authorbox
|
||||
strip_id_or_class: byline
|
||||
|
||||
test_url: http://boingboing.net/2011/10/23/understanding-the-hyperrich-through-the-lens-of-tomorrows-history.html
|
3
inc/3rdparty/site_config/standard/boldizsar.palotas.eu.txt
vendored
Normal file
3
inc/3rdparty/site_config/standard/boldizsar.palotas.eu.txt
vendored
Normal file
|
@ -0,0 +1,3 @@
|
|||
title: //h2[@class='entry-title']
|
||||
body: //div[@class='entry-content']
|
||||
test_url: http://boldizsar.palotas.eu/blog/?p=1394
|
6
inc/3rdparty/site_config/standard/book.douban.com.txt
vendored
Normal file
6
inc/3rdparty/site_config/standard/book.douban.com.txt
vendored
Normal file
|
@ -0,0 +1,6 @@
|
|||
body: //span[@property='v:description']
|
||||
date: //span[@property='v:dtreviewed']
|
||||
author: //span[@property='v:reviewer']
|
||||
prune: no
|
||||
|
||||
test_url: http://book.douban.com/review/2422662/
|
19
inc/3rdparty/site_config/standard/bookforum.com.txt
vendored
Normal file
19
inc/3rdparty/site_config/standard/bookforum.com.txt
vendored
Normal file
|
@ -0,0 +1,19 @@
|
|||
#metadata
|
||||
title://div[@class = 'Topper']/h1
|
||||
author://div[@class = 'Topper']/h3
|
||||
date://div[@class = 'Topper']/h6
|
||||
body://div[@class = 'Core']
|
||||
|
||||
|
||||
|
||||
# clean up
|
||||
strip://div[@class = 'Topper']/h1
|
||||
strip://div[@class = 'Topper']/h3
|
||||
strip://div[@class = 'Topper']/h4
|
||||
strip://div[@class = 'Topper']/h5
|
||||
strip://div[@class = 'Topper']/h6
|
||||
strip://br[@clear = 'all']
|
||||
strip://div[@class = 'adCore']
|
||||
strip://div[@class = 'BookR']
|
||||
strip://div[@class = 'InfoBox']
|
||||
test_url: http://bookforum.com/inprint/018_04/8595
|
7
inc/3rdparty/site_config/standard/borderhouseblog.com.txt
vendored
Normal file
7
inc/3rdparty/site_config/standard/borderhouseblog.com.txt
vendored
Normal file
|
@ -0,0 +1,7 @@
|
|||
title://h1
|
||||
author://div[@class="meta"]/span/a
|
||||
date://div[@class="date"]
|
||||
body://div[@class="content article"]
|
||||
strip://div[@class="content article"]/h1
|
||||
|
||||
test_url: http://borderhouseblog.com/?p=7832
|
16
inc/3rdparty/site_config/standard/bostonglobe.com.txt
vendored
Normal file
16
inc/3rdparty/site_config/standard/bostonglobe.com.txt
vendored
Normal file
|
@ -0,0 +1,16 @@
|
|||
# NOTE: If testing this configuration yields bad results, including junk text like "Try BostonGlobe.com today" and "THIS STORY APPEARED IN", please replace the Test URL with a current-day headline link from bostonglobe.com.
|
||||
|
||||
title: //div[@class="header"]/h1
|
||||
author: substring-after(//div[@class="byline"]/h2[@class="author"],"By ")
|
||||
date: //div[@class="byline"]/p[last()]
|
||||
body: //div[@class="article-body"]
|
||||
|
||||
strip_id_or_class: aside
|
||||
strip_id_or_class: promo
|
||||
strip_id_or_class: skip-nav
|
||||
strip_id_or_class: article-more
|
||||
strip_id_or_class: article-bar
|
||||
|
||||
# This removes image captions. If the parser starts saving images from bostonglobe.com (currently, it does not), then this directive should be removed.
|
||||
strip_id_or_class: figure
|
||||
test_url: http://bostonglobe.com/news/nation/2012/03/17/illinois-primary-could-pivotal/PsDzFZqvhEYyXbOcF9FOkO/story.html
|
15
inc/3rdparty/site_config/standard/bostonreview.net.txt
vendored
Normal file
15
inc/3rdparty/site_config/standard/bostonreview.net.txt
vendored
Normal file
|
@ -0,0 +1,15 @@
|
|||
#basics
|
||||
title://h3[@class = 'article_title']
|
||||
date://span[@class = 'article_date']
|
||||
body://div[@id = 'center_column_article']
|
||||
#correct, but author not being picked up in preview
|
||||
author://span[@class = 'article_author']
|
||||
|
||||
#strips basics from article
|
||||
strip_id_or_class:article_title
|
||||
strip_id_or_class:article_date
|
||||
strip_id_or_class:article_author
|
||||
|
||||
#strips pull quotes
|
||||
strip_id_or_class:pull_quote
|
||||
test_url: http://www.bostonreview.net/BR36.4/megan_pugh_agnes_de_mille_dance.php
|
5
inc/3rdparty/site_config/standard/boundlessline.org.txt
vendored
Normal file
5
inc/3rdparty/site_config/standard/boundlessline.org.txt
vendored
Normal file
|
@ -0,0 +1,5 @@
|
|||
title: substring-before(//title, '|')
|
||||
body: //div[@class="entry"]
|
||||
# Remove the author's picture
|
||||
strip: //div[@class="entry"]/a[1]
|
||||
test_url: http://www.boundlessline.org/2011/06/the-nyts-on-gender-over-the-weekend.html
|
10
inc/3rdparty/site_config/standard/brainfacts.org.txt
vendored
Normal file
10
inc/3rdparty/site_config/standard/brainfacts.org.txt
vendored
Normal file
|
@ -0,0 +1,10 @@
|
|||
title: //div[@class="standard"]/h1
|
||||
author: string("BrainFacts.org")
|
||||
date: //div[@class="meta"]/strong
|
||||
|
||||
strip: //p[@class="skip"]
|
||||
strip: //div[@class="meta"]
|
||||
strip: //div[@class="standard"]/h1
|
||||
strip: //div[@class="modal"]
|
||||
strip: //div[@class="columnRight"]
|
||||
test_url: http://brainfacts.org/diseases-disorders/childhood-disorders/articles/2011/autism-the-pervasive-developmental-disorder/
|
7
inc/3rdparty/site_config/standard/brandeins.de.txt
vendored
Normal file
7
inc/3rdparty/site_config/standard/brandeins.de.txt
vendored
Normal file
|
@ -0,0 +1,7 @@
|
|||
# set body
|
||||
body: //div[@id='theContent']
|
||||
|
||||
# set title
|
||||
title: //div[@id='theContent']/h3
|
||||
strip: //div[@id='theContent']/h3
|
||||
test_url: http://www.brandeins.de/archiv/magazin/gegessen-wird-immer/artikel/hunger.html
|
3
inc/3rdparty/site_config/standard/brandingstrategyinsider.com.txt
vendored
Normal file
3
inc/3rdparty/site_config/standard/brandingstrategyinsider.com.txt
vendored
Normal file
|
@ -0,0 +1,3 @@
|
|||
date://h2[@class="date-header"]
|
||||
body://div[@class="entry-content"]
|
||||
test_url: http://www.brandingstrategyinsider.com/2011/12/top-twelve-branding-keys-for-2012.html
|
5
inc/3rdparty/site_config/standard/brettterpstra.com.txt
vendored
Normal file
5
inc/3rdparty/site_config/standard/brettterpstra.com.txt
vendored
Normal file
|
@ -0,0 +1,5 @@
|
|||
body: //div[@class='post full']
|
||||
title: //h1
|
||||
author: substring-after(//title, '- ')
|
||||
date: //span[@class='date']
|
||||
test_url: http://brettterpstra.com/byword-for-ios/
|
2
inc/3rdparty/site_config/standard/brisbanetimes.com.au.txt
vendored
Normal file
2
inc/3rdparty/site_config/standard/brisbanetimes.com.au.txt
vendored
Normal file
|
@ -0,0 +1,2 @@
|
|||
body: //div[@class='articleBody']
|
||||
test_url: http://www.brisbanetimes.com.au/opinion/blogs/blunt-instrument/losing-our-minds--for-24-hours-20120118-1q682.html
|
13
inc/3rdparty/site_config/standard/brookings.edu.txt
vendored
Normal file
13
inc/3rdparty/site_config/standard/brookings.edu.txt
vendored
Normal file
|
@ -0,0 +1,13 @@
|
|||
title: //div[@id='contentheader']/h1
|
||||
author: //p[@class='attribution']/span[@class='author']/*
|
||||
# Is there a way to pull multiple authors? My XPath here is just grabbing the first
|
||||
|
||||
date: /html/head/meta[@name="date"]/@content
|
||||
body: //div[@class='main-content']
|
||||
|
||||
strip: //p[@class='byline']
|
||||
strip: //div[@class='img-gallery']
|
||||
strip: //div[@class='callout']
|
||||
strip: //div[@class='add-your-view']
|
||||
convert_double_br_tags: yes
|
||||
test_url: http://www.brookings.edu/opinions/2011/1018_cyberattack_libya_goldsmith.aspx
|
Some files were not shown because too many files have changed in this diff Show more
Loading…
Reference in a new issue