<span class="com"># -*- coding: utf-8 -*-</span> <span class="kwd">import</span><span class="pln"> scrapy </span><span class="com"># item class included here </span> <span class="kwd">class</span> <span class="typ">DmozItem</span><span class="pun">(</span><span class="pln">scrapy</span><span class="pun">.</span><span class="typ">Item</span><span class="pun">):</span> <span class="com"># define the fields for your item here like:</span><span class="pln"> link </span><span class="pun">=</span><span class="pln"> scrapy</span><span class="pun">.</span><span class="typ">Field</span><span class="pun">()</span><span class="pln"> attr </span><span class="pun">=</span><span class="pln"> scrapy</span><span class="pun">.</span><span class="typ">Field</span><span class="pun">()</span> <span class="kwd">class</span> <span class="typ">DmozSpider</span><span class="pun">(</span><span class="pln">scrapy</span><span class="pun">.</span><span class="typ">Spider</span><span class="pun">):</span><span class="pln"> name </span><span class="pun">=</span> <span class="str">"dmoz"</span><span class="pln"> allowed_domains </span><span class="pun">=</span> <span class="pun">[</span><span class="str">"craigslist.org"</span><span class="pun">]</span><span class="pln"> start_urls </span><span class="pun">=</span> <span class="pun">[</span> <span class="str">"http://chicago.craigslist.org/search/emd?"</span> <span class="pun">]</span><span class="pln"> BASE_URL </span><span class="pun">=</span> <span class="str">'http://chicago.craigslist.org/'</span> <span class="kwd">def</span><span class="pln"> parse</span><span class="pun">(</span><span class="pln">self</span><span class="pun">,</span><span class="pln"> response</span><span class="pun">):</span><span class="pln"> links </span><span class="pun">=</span><span class="pln"> response</span><span class="pun">.</span><span class="pln">xpath</span><span class="pun">(</span><span class="str">'//a[@class="hdrlnk"]/@href'</span><span class="pun">).</span><span class="pln">extract</span><span class="pun">()</span> <span class="kwd">for</span><span class="pln"> link </span><span class="kwd">in</span><span class="pln"> links</span><span class="pun">:</span><span class="pln"> absolute_url </span><span class="pun">=</span><span class="pln"> self</span><span class="pun">.</span><span class="pln">BASE_URL </span><span class="pun">+</span><span class="pln"> link </span><span class="kwd">yield</span><span class="pln"> scrapy</span><span class="pun">.</span><span class="typ">Request</span><span class="pun">(</span><span class="pln">absolute_url</span><span class="pun">,</span><span class="pln"> callback</span><span class="pun">=</span><span class="pln">self</span><span class="pun">.</span><span class="pln">parse_attr</span><span class="pun">)</span> <span class="kwd">def</span><span class="pln"> parse_attr</span><span class="pun">(</span><span class="pln">self</span><span class="pun">,</span><span class="pln"> response</span><span class="pun">):</span><span class="pln"> item </span><span class="pun">=</span> <span class="typ">DmozItem</span><span class="pun">()</span><span class="pln"> item</span><span class="pun">[</span><span class="str">"link"</span><span class="pun">]</span> <span class="pun">=</span><span class="pln"> response</span><span class="pun">.</span><span class="pln">url item</span><span class="pun">[</span><span class="str">"attr"</span><span class="pun">]</span> <span class="pun">=</span> <span class="str">""</span><span class="pun">.</span><span class="pln">join</span><span class="pun">(</span><span class="pln">response</span><span class="pun">.</span><span class="pln">xpath</span><span class="pun">(</span><span class="str">"//p[@class='attrgroup']//text()"</span><span class="pun">).</span><span class="pln">extract</span><span class="pun">())</span> <span class="kwd">return</span><span class="pln"> item</span>