In this tutorial we will scrape housing prices in Melbourne from this web site: https://www.domain.com.au/auction-results/melbourne/

Required python packages

We need urllib to connect to the website and BeautifulSoup to parse the HTML source. After that, the data can be stored in a JSON file.

from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup
import json
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
def getListings(url):
    """Fetch the auction-results page at *url* and collect all listings.

    Each <article> on the page holds one suburb's results: a <header>
    containing the suburb name, followed by one <ul> per property.
    Parsed rows are appended to the module-level ``listings`` list by
    getListing().

    Returns None on any HTTP or parsing error (best-effort scrape).
    """
    try:
        html = urlopen(url)
    except HTTPError:
        return None
    try:
        bs = BeautifulSoup(html.read(), 'html.parser')
        # NOTE(review): 'css-3xqrp1' looks like an auto-generated CSS class
        # and will silently stop matching if the site is redeployed — verify.
        # find_all is the modern name; findAll is a deprecated alias.
        articles = bs.find_all('article', {'class': 'css-3xqrp1'})
        for atc in articles:
            # Fixed: suburb was previously unbound — a <ul> appearing before
            # the <header> raised NameError, which the except below does not
            # catch. Reset per article so a suburb never leaks across articles.
            suburb = None
            for c in atc.children:
                if c.name == 'header':
                    suburb = c.h3.text
                elif c.name == 'ul':
                    getListing(c, suburb)
    except AttributeError:
        # Malformed/unexpected markup (e.g. missing <h3>) — give up quietly.
        return None
    
def getListing(tag, suburb=None):
    """Parse one property's <ul> *tag* and append a row dict to ``listings``.

    Expected children, in order: address <li>, house-info <li>,
    sold-info <li>, agent <li>. Any missing piece stays "Unknown".

    Fixed: the original indexed listing[0..3] unconditionally, raising
    IndexError whenever a listing had fewer than four children.
    """
    ladd = lagen = htype = hInfo = soldInfo = price = "Unknown"
    items = list(tag.children)
    if items:
        ladd = items[0].text
    if len(items) > 1 and items[1].name == 'li':
        htype, hInfo = getHouseInfo(items[1])
    if len(items) > 2 and items[2].name == 'li':
        soldInfo, price = getSoldInfo(items[2])
    if len(items) > 3 and items[3].name == 'li':
        lagen = items[3].text
    listings.append({'suburb': suburb, 'street': ladd, 'agent': lagen,
                     'type': htype, 'info': hInfo,
                     'sold': soldInfo, 'price': price})
    
def getSoldInfo(tag):
    """Return (sold_status, price) text from a sold-info <li> *tag*.

    Fixed: the original raised IndexError when the tag had no children;
    now both fields fall back to "Unknown".
    """
    parts = list(tag.children)
    if not parts:
        return "Unknown", "Unknown"
    if len(parts) >= 2:
        return parts[0].text, parts[1].text
    return parts[0].text, "Unknown"

def getHouseInfo(tag):
    """Return (house_type, house_info) text from a house-info <li> *tag*.

    Fixed: the original raised IndexError when the tag had no children;
    now both fields fall back to "Unknown".
    """
    parts = list(tag.children)
    if not parts:
        return "Unknown", "Unknown"
    if len(parts) >= 2:
        return parts[0].text, parts[1].text
    return parts[0].text, "Unknown"
# Module-level accumulator: getListing() appends one dict per property here.
listings = [] # Store all listings
# Weekly Melbourne auction-results page to scrape.
url = "https://www.domain.com.au/auction-results/melbourne/"
# Fetch and parse; populates `listings` as a side effect (returns None on error).
# NOTE(review): runs network I/O at import time — consider an
# `if __name__ == "__main__":` guard if this file is ever imported.
getListings(url)

Check results

len(listings)
637
listings[1:5]
[{'suburb': 'Abbotsford',
  'street': '1/47 Nicholson St',
  'agent': 'Biggin & Scott Richmond',
  'type': 'Townhouse',
  'info': '2 beds',
  'sold': 'Sold prior to auction',
  'price': '$1.12m'},
 {'suburb': 'Abbotsford',
  'street': '12 Paterson St',
  'agent': 'Biggin & Scott Richmond',
  'type': 'House',
  'info': '4 beds',
  'sold': 'Sold',
  'price': '$1.886m'},
 {'suburb': 'Abbotsford',
  'street': '4 Turner St',
  'agent': 'Jellis Craig Fitzroy',
  'type': 'House',
  'info': '3 beds',
  'sold': 'Sold',
  'price': 'Price withheld'},
 {'suburb': 'Airport West',
  'street': '2/74 Fraser St',
  'agent': 'Barry Plant Essendon',
  'type': 'Unit',
  'info': '3 beds',
  'sold': 'Sold prior to auction',
  'price': '$671.5k'}]

Store results

with open('listings.json', 'w') as f:
    json.dump(listings, f)
df = pd.read_json('listings.json')
df.shape
(637, 7)
df.head()
suburb street agent type info sold price
0 Abbotsford 34 Albert St Unknown Unknown Unknown House
1 Abbotsford 1/47 Nicholson St Biggin & Scott Richmond Townhouse 2 beds Sold prior to auction $1.12m
2 Abbotsford 12 Paterson St Biggin & Scott Richmond House 4 beds Sold $1.886m
3 Abbotsford 4 Turner St Jellis Craig Fitzroy House 3 beds Sold Price withheld
4 Airport West 2/74 Fraser St Barry Plant Essendon Unit 3 beds Sold prior to auction $671.5k
df['type'].value_counts().plot(kind='bar')
<AxesSubplot:>
df['info'].value_counts().plot(kind='bar')
<AxesSubplot:>