# =============================================================================
# Setup
# =============================================================================
import pandas as pd
import numpy as np
import os, time, random
from io import StringIO
# Import the necessary modules from the Selenium library
from selenium import webdriver # Main module to control the browser
from selenium.webdriver.common.by import By # Helps locate elements on the webpage
from selenium.webdriver.chrome.options import Options # Allows setting browser options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import StaleElementReferenceException
# Set the working directory path
# --- Working directory -------------------------------------------------------
# Absolute path to the project root; 'data/books.csv' is written relative to it.
wd_path = '/Users/bchoe/Documents/DANL-210' # e.g., '/Users/bchoe/Documents/DANL-210'
os.chdir(wd_path) # Change the current working directory to wd_path
os.getcwd() # Retrieve and return the current working directory
# --- Browser -----------------------------------------------------------------
# Create an instance of Chrome options (default options; headless not enabled)
options = Options()
# Initialize the Chrome WebDriver with the specified options
driver = webdriver.Chrome(options=options)
driver.get('https://books.toscrape.com')
# Randomized pause to mimic human browsing and avoid hammering the server.
time.sleep(random.uniform(1, 2))
# XPath of the <ul> holding the per-category links in the left sidebar.
xpath_category_side = '/html/body/div/div/div/aside/div[2]/ul/li/ul'
category_side = driver.find_element(By.XPATH, xpath_category_side)
# One <li> per category; n_categories drives the scraping loop below.
categories = category_side.find_elements(By.TAG_NAME, 'li')
n_categories = len(categories)
# Figuring out web-scrapping strategies (exploratory, kept for reference):
# categories[0].find_elements(By.TAG_NAME, 'a')
# xpath_cat = '/html/body/div/div/div/aside/div[2]/ul/li/ul/li[1]/a'
# cat = driver.find_element(By.XPATH, xpath_cat)
# cat.click()
# -----------------------------------------------------------------------------
# Scrape every book (category, title, price) across all category pages.
# Relies on module-level names defined in the setup above:
#   driver, n_categories, xpath_category_side, By, NoSuchElementException.
# Accumulate rows in a plain list and build the DataFrame once at the end:
# pd.concat per row inside the loop is quadratic in the number of books.
# -----------------------------------------------------------------------------
rows = []
for i in range(1, n_categories + 1):
    # Sidebar category links are 1-indexed in XPath.
    xpath_cat = f'/html/body/div/div/div/aside/div[2]/ul/li/ul/li[{i}]/a'
    cat = driver.find_element(By.XPATH, xpath_cat)
    cat.click()
    # Re-locate the sidebar after navigation: the previous reference is
    # detached from the DOM and would raise StaleElementReferenceException.
    category_side = driver.find_element(By.XPATH, xpath_category_side)
    categories = category_side.find_elements(By.TAG_NAME, 'li')
    category = categories[i - 1].text
    while True:
        # A "next" pagination link exists on every page but the last one.
        # Catch only NoSuchElementException (a bare except would also hide
        # real failures such as a dead driver) and use None as the sentinel.
        try:
            next_btn = driver.find_element(By.PARTIAL_LINK_TEXT, "next")
        except NoSuchElementException:
            next_btn = None
        # Each book card exposes its title in <h3><a title="..."> and its
        # price in an element with class "price_color"; the two lists are
        # parallel, so iterate them in lockstep.
        books = driver.find_elements(By.TAG_NAME, 'h3')
        prices = driver.find_elements(By.CLASS_NAME, 'price_color')
        for book, price_el in zip(books, prices):
            title = book.find_element(By.TAG_NAME, 'a').get_attribute("title")
            rows.append([category, title, price_el.text])
        if next_btn is None:
            break  # last page of this category
        next_btn.click()
        time.sleep(random.uniform(1, 2))  # polite, human-like delay
# One row per book, same columns and order as before.
df = pd.DataFrame(rows, columns=['category', 'title', 'price'])
df.to_csv('data/books.csv', index=False)
Classwork 6
Scraping Book Data with Python selenium
Setup
Below is the setup for the web-scraping environment with Python selenium:
import pandas as pd
import numpy as np
import os, time, random
from io import StringIO
# Import the necessary modules from the Selenium library
from selenium import webdriver # Main module to control the browser
from selenium.webdriver.common.by import By # Helps locate elements on the webpage
from selenium.webdriver.chrome.options import Options # Allows setting browser options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import StaleElementReferenceException
# Set the working directory path
wd_path = 'ABSOLUTE_PATHNAME_OF_YOUR_WORKING_DIRECTORY' # e.g., '/Users/bchoe/Documents/DANL-210'
os.chdir(wd_path) # Change the current working directory to wd_path
os.getcwd() # Retrieve and return the current working directory
# Create an instance of Chrome options
options = Options()
# Initialize the Chrome WebDriver with the specified options
driver = webdriver.Chrome(options=options)
Goal
You will scrape all books from https://books.toscrape.com using Python selenium, then build a DataFrame:
- Book table: one row per book
You will export a DataFrame as a CSV file.
Target
Create a DataFrame with one row per book and these columns:
- category: category of a book
- title: title of a book
- price: price of a book
Export as: data/books.csv
Discussion
Welcome to our Classwork 6 Discussion Board!
This space is designed for you to engage with your classmates about the material covered in Classwork 6.
Whether you are looking to delve deeper into the content, share insights, or have questions about the content, this is the perfect place for you.
If you have any specific questions for Byeong-Hak (@bcdanl) regarding the Classwork 6 materials or need clarification on any points, don't hesitate to ask here.
All comments will be stored here.
Let's collaborate and learn from each other!