PDA

View Full Version : Fix for TV.com scraper


korin
2007-07-24, 23:10
The TV.com HTML format changed slightly so I have managed to fix the TV.com scraper so that it returns a proper title, instead of an empty string. Here is the XML for anyone who is interested.


<?xml version="1.0" encoding="UTF-8"?>
<scraper name="TV.com" content="tvshows" thumb="tvcom.png">
<CreateSearchUrl dest="3">
  <!-- Builds the TV.com search URL from the search term in buffer 1 and writes it to buffer 3.
       The output is itself url markup, so ampersands in the query string are escaped twice
       (presumably because the generated text is parsed as XML a second time; confirm against the engine).
       The tag parameter must read tv_shows with no embedded space. -->
  <RegExp input="$$1" output="&lt;url&gt;http://www.tv.com/search.php?stype=program&amp;amp;qs=\1&amp;amp;tag=tv_shows&lt;/url&gt;" dest="3">
    <!-- Empty expression: presumably captures the whole input as \1 (engine convention; confirm). -->
    <expression></expression>
  </RegExp>
</CreateSearchUrl>
<GetSearchResults dest="3">
  <!-- Outer rule: wraps the entities collected in buffer 4 in a results element and writes it to buffer 3. -->
  <RegExp input="$$4" output="&lt;results&gt;\1&lt;/results&gt;" dest="3">
    <!-- One entity per matching show link in the search page (buffer 1):
         \1 is the numeric show id, \2 is the show title.
         Each entity carries the summary, cast and episode-listing URLs for that show id.
         The output attribute and the expression are each kept on a single line on purpose:
         a line break inside either one becomes part of the URL or of the regex. -->
    <RegExp input="$$1" output="&lt;entity&gt;&lt;title&gt;\2&lt;/title&gt;&lt;url&gt;http://www.tv.com/show/\1/summary.html&lt;/url&gt;&lt;url&gt;http://www.tv.com/show/\1/cast.html&lt;/url&gt;&lt;url&gt;http://www.tv.com/show/\1/episode_listings.html&lt;/url&gt;&lt;id&gt;\1&lt;/id&gt;&lt;/entity&gt;" dest="4">
      <expression repeat="yes">Show: &lt;a class=&quot;f-bold f-C30&quot; href=&quot;http://www\.tv\.com/[^/]*/show/([0-9]+)/[^&quot;]*&quot;&gt;([^&lt;]*)&lt;/a&gt;</expression>
    </RegExp>
    <expression noclean="1"></expression>
  </RegExp>
</GetSearchResults>
<GetDetails dest="7">
  <!-- Outer rule: wraps the fields collected in buffer 5 in a details element and writes it to buffer 7.
       The nested rules fill buffer 5; dest="5" is used for the first field and dest="5+" for the rest
       (presumably "5+" appends rather than overwrites; confirm against the scraper engine).
       Every expression below is kept on a single line: a line break inside an expression
       would become part of the regex. -->
  <RegExp input="$$5" output="&lt;details&gt;\1&lt;/details&gt;" dest="7">
    <!-- Show title: text of the h1 heading. -->
    <RegExp input="$$1" output="&lt;title&gt;\1&lt;/title&gt;" dest="5">
      <expression>&lt;h1&gt;([^&gt;]*)&lt;/h1&gt;</expression>
    </RegExp>
    <!-- Genres: one genre element per matching genre link. -->
    <RegExp input="$$1" output="&lt;genre&gt;\1&lt;/genre&gt;" dest="5+">
      <expression repeat="yes">;genre&quot;&gt;([^&gt;]*)&lt;/a&gt;</expression>
    </RegExp>
    <!-- Plot, first variant: taken from the script string that follows 'Close Full Summary [-]'.
         Buffer 8 is used as scratch space; the second nested rule re-emits the text
         split on backslashes (presumably to strip escape characters; confirm). -->
    <RegExp input="$$8" output="&lt;plot&gt;\1&lt;/plot&gt;" dest="5+">
      <RegExp input="$$1" output="\1" dest="8">
        <expression>'Close Full Summary \[-\]', '([^+]*)', '</expression>
      </RegExp>
      <RegExp input="$$8" output="\1" dest="8">
        <expression repeat="yes">([^\\]*)\\?</expression>
      </RegExp>
      <expression noclean="1"></expression>
    </RegExp>
    <!-- Plot, second variant: content of the summary_fold div, staged in buffer 7,
         cut at the first closing div, then emitted as a plot element. -->
    <RegExp input="$$1" output="\1" dest="7">
      <expression noclean="1">id=&quot;summary_fold&quot; class=&quot;mt-10&quot;&gt;[^a-zA-Z&lt;]*(.*)&lt;/div&gt;[^a-zA-Z]*&lt;/div&gt;[^a-zA-Z]*&lt;div class=&quot;divider&quot;</expression>
    </RegExp>
    <RegExp input="$$7" output="\1" dest="7">
      <expression trim="1">([^{]*)(&lt;/div&gt;.*)</expression>
    </RegExp>
    <RegExp input="$$7" output="&lt;plot&gt;\1&lt;/plot&gt;" dest="5+">
      <expression></expression>
    </RegExp>
    <!-- Runtime is not being used here
    <RegExp input="$$1" output="&lt;runtime&gt;\1 min&lt;/runtime&gt;" dest="5+">
    <expression>\(([0-9]*) min.\)</expression>
    </RegExp> -->
    <!-- Rating: numeric score in the span following "Score:". -->
    <RegExp input="$$1" output="&lt;rating&gt;\1&lt;/rating&gt;" dest="5+">
      <expression>Score:&lt;/span&gt;[^&lt;]*&lt;span[^&gt;]*&gt;([0-9\.]*)&lt;/span&gt;</expression>
    </RegExp>
    <!-- Vote count (digits and thousands separators). -->
    <RegExp input="$$1" output="&lt;votes&gt;\1&lt;/votes&gt;" dest="5+">
      <expression>&lt;span class=&quot;f-11&quot;&gt;([0-9,]*) votes&lt;/span&gt;</expression>
    </RegExp>
    <!-- Cast: read from buffer 2 (presumably the cast page, the second URL of the search result; confirm).
         \1 is the actor name, \2 the role. -->
    <RegExp input="$$2" output="&lt;actor&gt;&lt;name&gt;\1&lt;/name&gt;&lt;role&gt;\2&lt;/role&gt;&lt;/actor&gt;" dest="5+">
      <expression repeat="yes">\?tag=stars[^&gt;]*&gt;([^&lt;]*)&lt;/a&gt;&lt;br /&gt;[^&lt;]*&lt;span class=&quot;f-bold&quot;&gt;Role: ([^&lt;]*)&lt;/span&gt;</expression>
    </RegExp>
    <!-- Thumbnail: program header image URL. -->
    <RegExp input="$$1" output="&lt;thumb&gt;\1&lt;/thumb&gt;" dest="5+">
      <expression>(http://image\.com\.com/tv/images/content_headers/program/[0-9]*\.jpg)</expression>
    </RegExp>
    <!-- Show status text. -->
    <RegExp input="$$1" output="&lt;status&gt;\1&lt;/status&gt;" dest="5+">
      <expression trim="1">Status[^&lt;]*&lt;span class=&quot;f-333&quot;&gt;[^a-zA-Z]*([^&lt;]*)&lt;/span&gt;</expression>
    </RegExp>
    <!-- Premiere date text. -->
    <RegExp input="$$1" output="&lt;premiered&gt;\1&lt;/premiered&gt;" dest="5+">
      <expression trim="1">Premiered:[^&lt;]*&lt;span class=&quot;f-333&quot;&gt;([^&lt;]*)&lt;/span&gt;</expression>
    </RegExp>
    <!-- Episode guide: season listing URLs built from buffer 3 (presumably the episode-listings page)
         and the show id in buffer 4 (confirm both against the engine's buffer assignment).
         The first rule emits the season 1 URL when a "Next Season" link is present;
         the second emits one URL per entry in the season dropdown. -->
    <RegExp input="$$8" output="&lt;episodeguide&gt;\1&lt;/episodeguide&gt;" dest="5+">
      <RegExp input="$$3" output="&lt;url&gt;http://www.tv.com/show/$$4/episode_listings.html?season=1&lt;/url&gt;" dest="8">
        <expression>\|[^&lt;]+Next Season</expression>
      </RegExp>
      <RegExp input="$$3" output="&lt;url&gt;http://www.tv.com/show/$$4/episode_listings.html?season=\1&lt;/url&gt;" dest="8">
        <expression repeat="yes">&lt;option[^&gt;]*tag=season_dropdown[^&gt;]*&gt;Season ([0-9]+)&lt;/option&gt;</expression>
      </RegExp>
      <expression noclean="1"></expression>
    </RegExp>
    <expression noclean="1"></expression>
  </RegExp>
</GetDetails>
<GetEpisodeList dest="3">
  <!-- Outer rule: wraps the episodes collected in buffer 5 in an episodeguide element and writes it to buffer 3. -->
  <RegExp input="$$5" output="&lt;episodeguide&gt;\1&lt;/episodeguide&gt;" dest="3">
    <!-- Season number shown in the page heading, stored in buffer 6 for use in each episode entry. -->
    <RegExp input="$$1" output="\1" dest="6">
      <expression>&lt;span class=&quot;f-18 f-bold&quot;&gt;Season ([0-9]+)&lt;/span&gt;</expression>
    </RegExp>
    <!-- One episode element per row of the listing:
         \1 is the episode number, \2 the numeric episode id, \3 the episode title.
         The output attribute and the expression are each kept on a single line on purpose:
         a line break inside the expression would become part of the regex. -->
    <RegExp input="$$1" output="&lt;episode&gt;&lt;title&gt;\3&lt;/title&gt;&lt;id&gt;\2&lt;/id&gt;&lt;url&gt;http://www.tv.com/episode/\2/summary.html&lt;/url&gt;&lt;epnum&gt;\1&lt;/epnum&gt;&lt;season&gt;$$6&lt;/season&gt;&lt;/episode&gt;" dest="5">
      <expression repeat="yes">nowrap=&quot;nowrap&quot;&gt;[^0-9]*([0-9]+)[^&lt;]*&lt;/td&gt;[^&lt;]*&lt;td class=&quot;f-bold&quot;&gt;[^&lt;]*&lt;a[^&gt;]*/episode/([0-9]*)/summary\.html\?tag=ep_list[^&gt;]*&gt;([^&lt;]*)&lt;/a&gt;</expression>
    </RegExp>
    <expression noclean="1"></expression>
  </RegExp>
</GetEpisodeList>
<GetEpisodeDetails dest="3">
  <!-- Outer rule: wraps the fields collected in buffer 5 in a details element and writes it to buffer 3.
       The nested rules read the episode page from buffer 1 and fill buffer 5
       (dest="5" for the first field, dest="5+" for the rest). -->
  <RegExp input="$$5" output="&lt;details&gt;\1&lt;/details&gt;" dest="3">

    <!-- Main column of the page, stored in buffer 6.
         NOTE(review): no rule in this section reads buffer 6 afterwards. -->
    <RegExp input="$$1" output="\1" dest="6">
      <expression>&lt;div id=&quot;main-col&quot;&gt;(.*)&lt;div class=&quot;ta-r mt-10 f-bold&quot;&gt;</expression>
    </RegExp>

    <!-- Episode title. -->
    <RegExp input="$$1" output="&lt;title&gt;\1&lt;/title&gt;" dest="5">
      <expression>Episode: &lt;span class=&quot;f-FF9&quot;&gt;([^&lt;]*)&lt;/span&gt;&lt;br /&gt;</expression>
    </RegExp>

    <!-- Plot: text between a closing div and the "ta-r mt-10 f-bold" div. -->
    <RegExp input="$$1" output="&lt;plot&gt;\1&lt;/plot&gt;" dest="5+">
      <expression>div&gt;([^=]*)&lt;div class=&quot;ta-r mt-10 f-bold&quot;&gt;</expression>
    </RegExp>

    <!-- Rating: numeric score from the large score span. -->
    <RegExp input="$$1" output="&lt;rating&gt;\1&lt;/rating&gt;" dest="5+">
      <expression>&lt;span class=&quot;f-28 f-bold mt-10 mb-10 f-FF9 db lh-18&quot;&gt;([0-9.]+)&lt;/span&gt;</expression>
    </RegExp>

    <!-- Vote count (digits and thousands separators). -->
    <RegExp input="$$1" output="&lt;votes&gt;\1&lt;/votes&gt;" dest="5+">
      <expression>&lt;span class=&quot;f-11&quot;&gt;([0-9,]+) votes&lt;/span&gt;</expression>
    </RegExp>

    <!-- First-aired date: everything after the label up to the next ampersand. -->
    <RegExp input="$$1" output="&lt;aired&gt;\1&lt;/aired&gt;" dest="5+">
      <expression>First Aired: ([^&amp;]*)</expression>
    </RegExp>

    <!-- Cast: \1 is the linked name, \2 the parenthesised text after it, emitted as the role. -->
    <RegExp input="$$1" output="&lt;actor&gt;&lt;name&gt;\1&lt;/name&gt;&lt;role&gt;\2&lt;/role&gt;&lt;/actor&gt;" dest="5+">
      <expression repeat="yes">&quot;&gt;([^&lt;]*)&lt;/a&gt; \(([^&lt;]*)\)[^&lt;]*&lt;</expression>
    </RegExp>

    <!-- Director: first link in the table cell after the "Director:" label. -->
    <RegExp input="$$1" output="&lt;director&gt;\1&lt;/director&gt;" dest="5+">
      <expression>Director:[^&lt;]*&lt;/td&gt;[^&lt;]*&lt;td&gt;[^&lt;]*&lt;a[^&gt;]*&gt;([^&lt;]*)&lt;/a&gt;</expression>
    </RegExp>

    <!-- Writer: first link in the table cell after the "Writer:" label, emitted as credits. -->
    <RegExp input="$$1" output="&lt;credits&gt;\1&lt;/credits&gt;" dest="5+">
      <expression>Writer:[^&lt;]*&lt;/td&gt;[^&lt;]*&lt;td&gt;[^&lt;]*&lt;a[^&gt;]*&gt;([^&lt;]*)&lt;/a&gt;</expression>
    </RegExp>

    <!-- Thumbnail: URL from the inline image:url(...) style of the video-thumb element. -->
    <RegExp input="$$1" output="&lt;thumb&gt;\1&lt;/thumb&gt;" dest="5+">
      <expression>&quot;video-thumb mb-5&quot;[^&gt;]*image:url\(([^)]*)\);&quot;</expression>
    </RegExp>

    <!-- Production code. -->
    <RegExp input="$$1" output="&lt;code&gt;\1&lt;/code&gt;" dest="5+">
      <expression>Prod Code: ([^&lt;]*)&lt;/span&gt;</expression>
    </RegExp>

    <expression noclean="1"></expression>
  </RegExp>
</GetEpisodeDetails>
</scraper>

HarshReality
2007-07-24, 23:33
Greatly appreciated!

sho
2007-07-27, 21:31
Please submit as a patch on SF.
http://sourceforge.net/tracker/?group_id=87054

spiff
2007-07-29, 12:15
Added to SVN. In the future, please use the SF patch tracker.

ShortySco
2007-07-29, 22:11
Yeah this is fab, Well done!

Although I prefer the theory of the tvdb.com stuff, the tv.com scraper just worked better (before it broke).

Now I can have all the right years and genres until the tv.db scraper/site is working a little better.

Cheers!

Shorty