# Recovered from a git diff that added scrapImo.py together with two binary
# files (chromedriver.exe and scrapImo.exe); only the script is reproduced.
"""Scrape seloger.com listings for one postal code into ``offers.csv``.

The script asks for a postal code, the last result page to visit and
whether to search sales or rentals.  Each result page is rendered in a
headless Chrome driven by Selenium and parsed with BeautifulSoup; one CSV
row is written per listing.

The code deliberately sticks to constructs that work on both Python 2 and
Python 3, because the original script was written for Python 2.
"""
import csv
import os
import re
import sys

try:
    read_line = raw_input  # Python 2: input() would *evaluate* the answer
except NameError:
    read_line = input  # Python 3

filename = 'offers.csv'

headers = ['Type', 'Prix', 'Lieu', 'Quartier', 'Surface',
           'Nb_pieces', 'Nb_chambres', 'Extra', 'Numero', 'Lien']

# Search URLs; ``{cp}`` is replaced by the postal code.
BUY_URL = ('https://www.seloger.com/list.htm?enterprise=0&natures=1,4'
           '&places=%5b%7bcp%3a{cp}%7d%5d&projects=2&qsversion=1.0'
           '&rooms=1,2,3,4&types=1')
RENT_URL = ('https://www.seloger.com/list.htm?enterprise=0&furnished=0'
            '&places=%5b%7bcp%3a{cp}%7d%5d&projects=1&qsversion=1.0'
            '&rooms=1,2,3,4&types=1')

# Class attribute values are kept verbatim from the original script,
# including their leading/trailing spaces.
# NOTE(review): whether a padded string still matches depends on how the
# installed BeautifulSoup splits the class attribute -- confirm against the
# live markup.
CONTAINER_CLASSES = [
    "c-pa-list c-pa-sl c-pa-gold cartouche ",
    "c-pa-list c-pa-bd c-pa-gold cartouche ",
    "c-pa-list c-pa-sl c-pa-silver cartouche ",
    "c-pa-list c-pa-bd c-pa-silver cartouche ",
    "c-pa-list c-pa-sl cartouche ",
]
PHONE_CLASS = " tagClick desktop listContactPhone"

# Detail tokens are expected to look like "65 m", "3 p" or "2 ch" once
# non-ASCII characters (such as the superscript two) have been dropped.
SURFACE_RE = re.compile(r'^(\d[\d\s.,]*?)\s*m2?$')
ROOMS_RE = re.compile(r'^(\d+)\s*p$')
BEDROOMS_RE = re.compile(r'^(\d+)\s*ch$')
# "11eme" written with a grave accent becomes "11me" after ASCII folding.
ORDINAL_RESIDUE_RE = re.compile(r'(\d)me\b')


def to_ascii(text):
    """Return *text* with every non-ASCII character removed."""
    return text.encode('ascii', 'ignore').decode('ascii')


def clean_location(text):
    """Return the ASCII city label without the ordinal residue.

    Only an ``me`` that directly follows a digit is removed, so city names
    that merely contain the letters "me" are left untouched.
    """
    return ORDINAL_RESIDUE_RE.sub(r'\1', to_ascii(text)).strip()


def extract_quartier(link):
    """Derive the neighbourhood label from a listing URL.

    The second-to-last path segment is used.  When that segment contains
    ``paris`` the last segment is used instead, unless it carries a query
    string, in which case the neighbourhood is left empty.
    """
    parts = link.split('/')
    if len(parts) < 2:
        return ''
    quartier = parts[-2].replace('-', ' ')
    if 'paris' in quartier:
        quartier = parts[-1].replace('-', ' ')
        if '?' in quartier:
            quartier = ''
    return quartier


def parse_details(text):
    """Split a listing's detail text into its components.

    Returns a ``(surface, nb_pieces, nb_chambres, extra)`` tuple of strings.
    Blank lines are ignored and every token that is not a recognised
    measurement is kept in ``extra`` (joined with ``", "``) instead of
    overwriting the previous one.
    """
    surface = ""
    nb_pieces = ""
    nb_chambres = ""
    extras = []
    for line in text.split('\n'):
        token = to_ascii(line).strip()
        if not token:
            continue
        match = SURFACE_RE.match(token)
        if match:
            surface = match.group(1).strip()
            continue
        match = ROOMS_RE.match(token)
        if match:
            nb_pieces = match.group(1)
            continue
        match = BEDROOMS_RE.match(token)
        if match:
            nb_chambres = match.group(1)
            continue
        extras.append(token)
    return surface, nb_pieces, nb_chambres, ", ".join(extras)


def build_search_url(cp, choice):
    """Return the search URL for postal code *cp*.

    *choice* selects sales (``a`` / ``achat``) or rentals (``l`` /
    ``location``), case-insensitively.

    Raises:
        ValueError: if *choice* is none of the accepted answers.
    """
    key = choice.strip().lower()
    if key in ('achat', 'a'):
        template = BUY_URL
    elif key in ('location', 'l'):
        template = RENT_URL
    else:
        raise ValueError("Choix invalide : entrer a ou l")
    return template.format(cp=cp)


def page_urls(adress, max_page):
    """Return the URLs of result pages 1 through *max_page*, inclusive."""
    return [adress + '&LISTING-LISTpg=' + str(page)
            for page in range(1, max_page + 1)]


def parse_postal_code(raw):
    """Validate a postal code typed by the user and return it as text.

    The value stays a string so that leading zeros are preserved.
    """
    cp = raw.strip()
    if not cp.isdigit():
        raise ValueError("Code postal invalide")
    return cp


def parse_max_page(raw):
    """Convert the user's answer to a page count of at least 1."""
    page = int(raw.strip())
    if page < 1:
        raise ValueError("Le numero de page doit etre >= 1")
    return page


def ask(prompt, parse):
    """Prompt until *parse* accepts the answer, then return its result."""
    while True:
        try:
            return parse(read_line(prompt))
        except ValueError as err:
            print(err)


def parse_listing(container):
    """Turn one listing container into a CSV row (same order as ``headers``).

    Returns ``None`` when the container has no info block or no link.
    Missing optional nodes (price, city, phone, details) yield empty cells
    instead of aborting the whole run.
    """
    info = container.find("div", {"class": "c-pa-info"})
    if info is None or info.a is None:
        return None

    link = info.a.get("href", "").strip()
    price_tag = info.find("span", {"class": "c-pa-cprice"})
    city_tag = info.find("div", {"class": "c-pa-city"})
    phone_tag = info.find("a", {"class": PHONE_CLASS})
    details_tag = info.div

    price = to_ascii(price_tag.text).strip() if price_tag else ""
    city = clean_location(city_tag.text) if city_tag else ""
    # Reset for every listing so a number never leaks from the previous one.
    phone = phone_tag.get("data-tooltip-focus", "") if phone_tag else ""
    details = details_tag.text if details_tag else ""
    surface, nb_pieces, nb_chambres, extra = parse_details(details)

    return [to_ascii(info.a.text), price, city, extract_quartier(link),
            surface, nb_pieces, nb_chambres, extra, phone, link]


def open_csv(path):
    """Open *path* for writing in the mode the csv module expects."""
    if sys.version_info[0] < 3:
        # Python 2's csv module wants a binary file.
        return open(path, 'wb')
    # newline='' stops the csv module's line endings being translated again.
    return open(path, 'w', newline='', encoding='utf-8')


def scrape_offers(adress, max_page, writer):
    """Visit every result page and write one row per listing to *writer*.

    A single headless Chrome instance is shared by all pages and is always
    shut down, even when a page fails.  Returns the number of rows written.
    """
    # Imported here so the pure helpers above remain importable on a
    # machine without the browser stack installed.
    from bs4 import BeautifulSoup
    from selenium import webdriver

    options = webdriver.ChromeOptions()
    options.add_argument('headless')
    # NOTE(review): newer Selenium releases name this keyword ``options``;
    # kept as in the original script -- confirm against the installed version.
    browser = webdriver.Chrome(chrome_options=options)

    count = 0
    try:
        for url in page_urls(adress, max_page):
            browser.get(url)
            soup = BeautifulSoup(browser.page_source, "html.parser")
            for container in soup.find_all("div",
                                           {"class": CONTAINER_CLASSES}):
                row = parse_listing(container)
                if row is None:
                    continue
                # Console feedback: every non-empty field except the link.
                for value in row[:-1]:
                    if value:
                        print(value)
                print("")
                writer.writerow(row)
                count += 1
    finally:
        browser.quit()
    return count


def main():
    """Prompt for the search parameters, scrape, and write ``offers.csv``."""
    cp = ask("Code postal ?\n", parse_postal_code)
    max_page = ask("Jusqu a quelle page ?\n", parse_max_page)
    adress = ask("Entrer a pour achat ou l pour location\n",
                 lambda raw: build_search_url(cp, raw))

    # Mode 'w' truncates any previous results; the file is opened once and
    # closed (hence flushed) by the ``with`` block even if scraping fails.
    with open_csv(filename) as f:
        writer = csv.writer(f)
        writer.writerow(headers)
        count = scrape_offers(adress, max_page, writer)

    print("%d annonces enregistrees dans %s"
          % (count, os.path.abspath(filename)))


if __name__ == "__main__":
    main()