Source code for extract

#!/usr/bin/python
"""Extracting Data from Zillow, and storing into a csv file to be graphed.
"""
import os
import re as regex
from bs4 import BeautifulSoup
import requests
import unicodecsv as csv
from webcrawler.crawler import WebCrawler

__author__ = "Disaiah Bennett"
__version__ = "1.0"

def main():
    """Extract housing information from Zillow using the web crawler."""
    url = "https://www.zillow.com/homes/for_sale/"  # search results base URL
    head = "https://www.zillow.com/"  # base URL for individual listing links
    tail = "_rb/?fromHomePage=true&shouldFireSellPageImplicitClaimGA=false&fromHomePageTab=buy"
    zipcode = "27909"  # zip code to search
    total = 0  # running count of listing URLs
    page_num = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    crawler = WebCrawler()
    csv_file = csv.writer(open("zillow_homes_%s.csv" % zipcode, "wb"))

    with requests.Session() as session:
        # Fetch each results page for the zip code.
        for i, _ in enumerate(page_num):
            crawler.url = "%s%s/%d_p/%s" % (url, zipcode, page_num[i], tail)
            print crawler.url
            crawler.page = session.get(crawler.get_url(), headers=crawler.get_header())
            soup = BeautifulSoup(crawler.get_data(), 'html.parser')

            # Collect the links to the individual property pages on this results page.
            prop_links = soup.findAll('a', {"class": "zsg-photo-card-overlay-link routable hdp-link routable mask hdp-link"})
            crawler.links = [link.get('href') for link in prop_links]

            print "Zip Code: %s\n" % zipcode, \
                "----------------------------------------------------------------------------------"

            for i, link in enumerate(crawler.get_links()):
                crawler.sub_url = "%s%s" % (head, link)
                if crawler.sub_url:
                    total += 1
                    print "%d %s" % (total, crawler.sub_url)
                    crawler.page = session.get(crawler.sub_url, headers=crawler.get_header())
                    soup = BeautifulSoup(crawler.get_data(), 'html.parser')

                    # Pull the year built, address, price, and size facts from the listing page.
                    house_years = regex.findall(r'<div class="fact-value">Built in (.*?)<', str(soup))
                    house_address = soup.findAll('div', {"class": "zsg-h1 hdp-home-header-st-addr"})
                    house_cost = soup.findAll('div', {"class": "price"})
                    house_size = soup.findAll('h3', {"class": "edit-facts-light"})

                    for address in house_address:
                        if address:
                            print address.text
                            crawler.address(address.text)
                            break

                    for year in house_years:
                        if year:
                            print year
                            crawler.year(int(year))
                            break

                    for cost in house_cost:
                        if cost:
                            print cost.text.replace("and up", "").replace("from", "")
                            crawler.price(float(cost.text.replace("and up", "").replace("from", "").replace("$", "").replace(",", "").replace("From:", "")))
                            break

                    for size in house_size:
                        if size:
                            # Keep the portion of the facts line after the bath count (the square footage).
                            if "baths" in size.text:
                                try:
                                    print size.text.replace("--", "N/A").split("baths")[1]
                                    crawler.size(size.text.replace("--", "0").replace("sqft", "").replace(",", "").split("baths")[1])
                                except IndexError:
                                    pass
                            elif "bath" in size.text:
                                try:
                                    print size.text.replace("--", "N/A").split("bath")[1]
                                    crawler.size(size.text.replace("--", "0").replace("sqft", "").replace(",", "").split("bath")[1])
                                except IndexError:
                                    pass
                            else:
                                print size.text
                                crawler.size(size.text)

                    print "\n"

                    # Write a CSV row, filling in placeholders for whichever facts are missing.
                    if house_address and house_cost and house_size and house_years:
                        try:
                            crawler.prop_size[i] = crawler.prop_size[i].replace("sqft", "").replace("acres", "").replace("--", "").replace(",", "")
                            csv_file.writerow([crawler.prop_address[i], crawler.prop_year[i], float(crawler.prop_size[i]), crawler.prop_price[i]])
                        except IndexError:
                            pass
                    elif house_address and house_cost and house_size:
                        try:
                            crawler.prop_size[i] = crawler.prop_size[i].replace("sqft", "").replace("acres", "").replace("--", "0").replace(",", "")
                            csv_file.writerow([crawler.prop_address[i], "N/A", float(crawler.prop_size[i]), crawler.prop_price[i]])
                        except IndexError:
                            pass
                    elif house_address and house_size and house_years:
                        try:
                            crawler.prop_size[i] = crawler.prop_size[i].replace("sqft", "").replace("acres", "").replace("--", "0").replace(",", "")
                            csv_file.writerow([crawler.prop_address[i], crawler.prop_year[i], float(crawler.prop_size[i]), 0.0])
                        except IndexError:
                            pass

            # Reset the crawler's collected property data before the next results page.
            crawler.data_clear()

    # os.system(". move_csv.sh")
    os.system("python webcrawler/analyze.py")
if __name__ == "__main__":
    main()
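
The script writes one row per listing (address, year built, square footage, price) and then hands off to webcrawler/analyze.py for graphing. As a rough illustration only, not part of this module, the generated CSV could be loaded and plotted along these lines, assuming pandas and matplotlib are installed and the default zip code of 27909 was used:

# Rough sketch, not part of the module: load and plot the generated CSV.
# Assumes pandas and matplotlib are installed; the column order matches the
# writerow calls above (address, year built, size in sqft, price).
import pandas as pd
import matplotlib.pyplot as plt

homes = pd.read_csv("zillow_homes_27909.csv",
                    names=["address", "year_built", "sqft", "price"])
homes.plot.scatter(x="sqft", y="price")
plt.show()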