Collecting Melbourne housing prices with Beautiful Soup
In this tutorial we will scrape housing prices in Melbourne.
In this tutorial we will scrape housing prices in Melbourne from this website: https://www.domain.com.au/auction-results/melbourne/
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup
import json
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
def getListings(url):
    """Download the auction-results page at ``url`` and record every listing.

    Each ``<article class="css-3xqrp1">`` element is walked child by child:
    a ``<header>`` child supplies the suburb name (the text of its ``<h3>``)
    and every ``<ul>`` child is passed to ``getListing`` together with the
    most recently seen suburb.  ``getListing`` stores the parsed row in the
    module-level ``listings`` list, so results are collected as a side
    effect rather than returned.

    Args:
        url: Address of the auction-results page to download.

    Returns:
        Always ``None``.  An ``HTTPError`` while downloading, or an
        ``AttributeError`` while parsing (for example a header without an
        ``<h3>``), ends the scrape early; rows recorded before the failure
        stay in ``listings``.
    """
    try:
        # The context manager closes the HTTP response once the body is read.
        with urlopen(url) as response:
            page = response.read()
    except HTTPError:
        return None

    # Start with no suburb so that a <ul> met before any <header> is recorded
    # with suburb=None (getListing's own default) instead of raising
    # UnboundLocalError, which the AttributeError handler would not catch.
    suburb = None
    try:
        bs = BeautifulSoup(page, 'html.parser')
        for article in bs.find_all('article', {'class': 'css-3xqrp1'}):
            for child in article.children:
                if child.name == 'header':
                    suburb = child.h3.text
                elif child.name == 'ul':
                    getListing(child, suburb)
    except AttributeError:
        return None
    return None
def getListing(tag, suburb=None):
    """Parse one auction listing and append it to the module-level ``listings``.

    The children of ``tag`` are read by position: child 0 gives the street
    address, child 1 the property type/info (via ``getHouseInfo``), child 2
    the sale status/price (via ``getSoldInfo``) and child 3 the agent.
    Children 1-3 are used only when they are ``<li>`` elements.  Any field
    whose child is missing or skipped keeps the placeholder ``"Unknown"``.

    Args:
        tag: Element whose ``children`` make up a single listing.
        suburb: Suburb the listing belongs to; stored unchanged.

    Returns:
        ``None``.  One dict with the keys ``suburb``, ``street``, ``agent``,
        ``type``, ``info``, ``sold`` and ``price`` is appended to
        ``listings`` on every call.
    """
    street = agent = house_type = house_info = sold_info = price = "Unknown"
    parts = list(tag.children)

    # Every positional access is length-checked: a listing with fewer than
    # four children must yield "Unknown" fields, not an IndexError that
    # aborts the whole scrape.
    if parts:
        street = parts[0].text
    if len(parts) > 1 and parts[1].name == 'li':
        house_type, house_info = getHouseInfo(parts[1])
    if len(parts) > 2 and parts[2].name == 'li':
        sold_info, price = getSoldInfo(parts[2])
    if len(parts) > 3 and parts[3].name == 'li':
        agent = parts[3].text

    listings.append({
        'suburb': suburb,
        'street': street,
        'agent': agent,
        'type': house_type,
        'info': house_info,
        'sold': sold_info,
        'price': price,
    })
def getSoldInfo(tag):
    """Return the text of the first two children of ``tag`` as a pair.

    The caller unpacks the pair as ``(sold info, price)``.

    Args:
        tag: Element whose ``children`` each expose a ``text`` attribute.

    Returns:
        A 2-tuple of strings.  A missing second child yields ``"Unknown"``
        in the second slot; an element with no children yields
        ``("Unknown", "Unknown")`` instead of raising ``IndexError``.
    """
    parts = list(tag.children)
    if not parts:
        return "Unknown", "Unknown"
    status = parts[0].text
    amount = parts[1].text if len(parts) >= 2 else "Unknown"
    return status, amount
def getHouseInfo(tag):
    """Return the text of the first two children of ``tag`` as a pair.

    The caller unpacks the pair as ``(property type, property info)``.

    Args:
        tag: Element whose ``children`` each expose a ``text`` attribute.

    Returns:
        A 2-tuple of strings.  A missing second child yields ``"Unknown"``
        in the second slot; an element with no children yields
        ``("Unknown", "Unknown")`` instead of raising ``IndexError``.
    """
    parts = list(tag.children)
    if not parts:
        return "Unknown", "Unknown"
    kind = parts[0].text
    details = parts[1].text if len(parts) >= 2 else "Unknown"
    return kind, details
# Module-level accumulator: getListing() appends one dict per parsed listing.
listings = [] # Store all listings
url = "https://www.domain.com.au/auction-results/melbourne/"
# Download and parse the page. Rows are collected in `listings` as a side
# effect; getListings() itself returns None.
getListings(url)
# The bare expressions below produce visible output only in a notebook/REPL,
# where the value of an expression is displayed; in a script they do nothing.
len(listings)
listings[1:5] # rows at index 1 through 4 (index 0 is not included)
# Save the scraped rows as JSON, then load the file back into a DataFrame.
with open('listings.json', 'w') as f:
    json.dump(listings, f)
df = pd.read_json('listings.json')
df.shape
df.head()
# Bar charts of how often each distinct value occurs in the 'type' and
# 'info' columns.
# NOTE(review): no plt.show() is called here, so this presumably relies on a
# notebook's inline plotting to render the figures - confirm.
df['type'].value_counts().plot(kind='bar')
df['info'].value_counts().plot(kind='bar')