immo_scrap/scrapImo.py
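# Scrape SeLoger property listings (purchase or rental) for a given postal code
# and write one row per offer to offers.csv.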

from bs4 import BeautifulSoup as bs
from selenium import webdriver
import csv
import os
filename = 'offers.csv'
if os.path.exists(filename):
    os.remove(filename)
headers = ['Type', 'Prix', 'Lieu', 'Quartier', 'Surface',
           'Nb_pieces', 'Nb_chambres', 'Extra', 'Numero', 'Lien']
# Write the header row; newline='' keeps csv from inserting blank lines on Windows
with open(filename, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(headers)
cp = input("Code postal ?\n")
max_page = int(input("Jusqu'a quelle page ?\n"))
choice = input("Entrer a pour achat ou l pour location\n")
buy = 'https://www.seloger.com/list.htm?enterprise=0&natures=1,4&places=%5b%7bcp%3a' + \
    str(cp) + '%7d%5d&projects=2&qsversion=1.0&rooms=1,2,3,4&types=1'
rent = 'https://www.seloger.com/list.htm?enterprise=0&furnished=0&places=%5b%7bcp%3a' + \
    str(cp) + '%7d%5d&projects=1&qsversion=1.0&rooms=1,2,3,4&types=1'
if choice.lower() in ['achat', 'a']:
    adress = buy
elif choice.lower() in ['location', 'l']:
    adress = rent
else:
    raise SystemExit("Choix invalide : entrer 'a' ou 'l'")
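# Fetch each result page with headless Chrome and parse the rendered HTML with BeautifulSoup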
for x in range(1, max_page):
    url = adress + '&LISTING-LISTpg=' + str(x)
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    browser = webdriver.Chrome(options=options)
    browser.get(url)
    html = browser.page_source
    browser.quit()
    soup = bs(html, "html.parser")
    # Listing cards come in several class variants (gold/silver, sl/bd)
    containers = soup.findAll("div", {"class": ["c-pa-list c-pa-sl c-pa-gold cartouche ",
                                                "c-pa-list c-pa-bd c-pa-gold cartouche ",
                                                "c-pa-list c-pa-sl c-pa-silver cartouche ",
                                                "c-pa-list c-pa-bd c-pa-silver cartouche ",
                                                "c-pa-list c-pa-sl cartouche "]})
    for c in containers:
        info_c = c.findAll("div", {"class": "c-pa-info"})[0]
        h_link = info_c.a["href"].strip()
        # The neighbourhood is normally the second-to-last segment of the offer URL
        h_quartier = h_link.split('/')[-2].replace('-', ' ')
        if 'paris' in h_quartier:
            h_quartier = h_link.split('/')[-1].replace('-', ' ')
        if '?' in h_quartier:
            h_quartier = ''
        h_type = info_c.a.text
        h_price = info_c.findAll("span", {"class": "c-pa-cprice"})[0] \
            .text.strip().encode('ascii', errors='ignore').decode()
        h_loc = info_c.findAll("div", {"class": "c-pa-city"})[0] \
            .text.encode('ascii', errors='ignore').decode().replace('me', '')
        # The phone number is only present on some cards
        h_phone = ""
        phone_links = info_c.findAll("a", {"class": " tagClick desktop listContactPhone"})
        if len(phone_links) != 0:
            h_phone = phone_links[0]["data-tooltip-focus"]
        h_nbp = ""
        h_nbch = ""
        h_extra = ""
        h_surface = ""
        print(h_type)
        print(h_price)
        print(h_loc)
        print(h_quartier)
        # Remaining details are a newline-separated list: surface, rooms, bedrooms, extras
        for info in info_c.div.text.strip().split('\n'):
            if "m" in info:
                h_surface = info.encode('ascii', errors='ignore').decode().replace(' m', '')
                print(h_surface)
            elif "p" in info:
                h_nbp = info.replace(' p', '')
                print(h_nbp)
            elif "ch" in info:
                h_nbch = info.replace(' ch', '')
                print(h_nbch)
            else:
                h_extra = info.encode('ascii', errors='ignore').decode()
                print(h_extra)
        print(h_phone)
        print("")
        # Append the offer to the CSV
        with open(filename, 'a', newline='') as f:
            writer = csv.writer(f)
            writer.writerow([h_type, h_price, h_loc, h_quartier,
                             h_surface, h_nbp, h_nbch, h_extra, h_phone, h_link])