immo_scrap/scrapImo.py
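# Scrape SeLoger property listings (purchase or rental) for a given postal code
# and write one row per offer to offers.csv.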

from bs4 import BeautifulSoup as bs
from selenium import webdriver
import csv
import os
filename = 'offers.csv'
if os.path.exists(filename):
    os.remove(filename)
headers = ['Type', 'Prix', 'Lieu', 'Quartier', 'Surface',
           'Nb_pieces', 'Nb_chambres', 'Extra', 'Numero', 'Lien']
# Write the header row; newline='' keeps csv from inserting blank lines on Windows
with open(filename, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(headers)
cp = input("Code postal ?\n")
max_page = int(input("Jusqu'a quelle page ?\n"))
choice = input("Entrer a pour achat ou l pour location\n")
buy = 'https://www.seloger.com/list.htm?enterprise=0&natures=1,4&places=%5b%7bcp%3a' + \
    str(cp) + '%7d%5d&projects=2&qsversion=1.0&rooms=1,2,3,4&types=1'
rent = 'https://www.seloger.com/list.htm?enterprise=0&furnished=0&places=%5b%7bcp%3a' + \
    str(cp) + '%7d%5d&projects=1&qsversion=1.0&rooms=1,2,3,4&types=1'
if choice.lower() in ['achat', 'a']:
    adress = buy
elif choice.lower() in ['location', 'l']:
    adress = rent
else:
    raise SystemExit("Choix invalide : entrer 'a' ou 'l'")
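# Fetch each result page with headless Chrome and parse the rendered HTML with BeautifulSoup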
for x in range(1, max_page):
    url = adress + '&LISTING-LISTpg=' + str(x)
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    browser = webdriver.Chrome(options=options)
    browser.get(url)
    html = browser.page_source
    browser.quit()
    soup = bs(html, "html.parser")
    # Listing cards come in several class variants (gold/silver, sl/bd)
    containers = soup.findAll("div", {"class": ["c-pa-list c-pa-sl c-pa-gold cartouche ",
                                                "c-pa-list c-pa-bd c-pa-gold cartouche ",
                                                "c-pa-list c-pa-sl c-pa-silver cartouche ",
                                                "c-pa-list c-pa-bd c-pa-silver cartouche ",
                                                "c-pa-list c-pa-sl cartouche "]})
    for c in containers:
        info_c = c.findAll("div", {"class": "c-pa-info"})[0]
        h_link = info_c.a["href"].strip()
        # The neighbourhood is normally the second-to-last segment of the offer URL
        h_quartier = h_link.split('/')[-2].replace('-', ' ')
        if 'paris' in h_quartier:
            h_quartier = h_link.split('/')[-1].replace('-', ' ')
        if '?' in h_quartier:
            h_quartier = ''
        h_type = info_c.a.text
        h_price = info_c.findAll("span", {"class": "c-pa-cprice"})[0] \
            .text.strip().encode('ascii', errors='ignore').decode()
        h_loc = info_c.findAll("div", {"class": "c-pa-city"})[0] \
            .text.encode('ascii', errors='ignore').decode().replace('me', '')
        # The phone number is only present on some cards
        h_phone = ""
        phone_links = info_c.findAll("a", {"class": " tagClick desktop listContactPhone"})
        if len(phone_links) != 0:
            h_phone = phone_links[0]["data-tooltip-focus"]
        h_nbp = ""
        h_nbch = ""
        h_extra = ""
        h_surface = ""
        print(h_type)
        print(h_price)
        print(h_loc)
        print(h_quartier)
        # Remaining details are a newline-separated list: surface, rooms, bedrooms, extras
        for info in info_c.div.text.strip().split('\n'):
            if "m" in info:
                h_surface = info.encode('ascii', errors='ignore').decode().replace(' m', '')
                print(h_surface)
            elif "p" in info:
                h_nbp = info.replace(' p', '')
                print(h_nbp)
            elif "ch" in info:
                h_nbch = info.replace(' ch', '')
                print(h_nbch)
            else:
                h_extra = info.encode('ascii', errors='ignore').decode()
                print(h_extra)
        print(h_phone)
        print("")
        # Append the offer to the CSV
        with open(filename, 'a', newline='') as f:
            writer = csv.writer(f)
            writer.writerow([h_type, h_price, h_loc, h_quartier,
                             h_surface, h_nbp, h_nbch, h_extra, h_phone, h_link])