Make Scrapy follow links and collect data

# -*- coding: utf-8 -*-
import scrapy


# item class included here
class DmozItem(scrapy.Item):
    """Container for the data scraped from one followed page."""

    # URL of the page the data was collected from.
    link = scrapy.Field()
    # Text collected from that page's attribute group.
    attr = scrapy.Field()


class DmozSpider(scrapy.Spider):
    """Follow every listing link on the start page and scrape each listing.

    ``parse`` handles the search page in ``start_urls`` and schedules one
    request per listing link; ``parse_attr`` handles each listing page and
    returns a ``DmozItem`` with the page URL and its attribute text.
    """

    name = "dmoz"
    allowed_domains = ["craigslist.org"]
    start_urls = [
        "http://chicago.craigslist.org/search/emd?",
    ]

    # Kept for backward compatibility with code that reads this attribute;
    # parse() resolves links against the response URL instead.
    BASE_URL = 'http://chicago.craigslist.org/'

    def parse(self, response):
        """Yield a request for each listing linked from the search page.

        Args:
            response: The downloaded search-results page.

        Yields:
            scrapy.Request: One request per ``a.hdrlnk`` href, handled by
            ``parse_attr``.
        """
        links = response.xpath('//a[@class="hdrlnk"]/@href').extract()
        for link in links:
            # Resolve against the page URL rather than concatenating with
            # BASE_URL: BASE_URL ends in "/", so a root-relative href would
            # yield a "//" path, and an absolute href would yield garbage.
            absolute_url = response.urljoin(link)
            yield scrapy.Request(absolute_url, callback=self.parse_attr)

    def parse_attr(self, response):
        """Build an item from a single listing page.

        Args:
            response: The downloaded listing page.

        Returns:
            DmozItem: ``link`` is the page URL; ``attr`` is every text node
            under ``p.attrgroup`` concatenated with no separator.
        """
        item = DmozItem()
        item["link"] = response.url
        item["attr"] = "".join(response.xpath("//p[@class='attrgroup']//text()").extract())
        return item