# %%
# =============================================================================
# Setup
# =============================================================================
import pandas as pd
import os, time, random
from io import StringIO
# Import the necessary modules from the Selenium library
from selenium import webdriver # Main module to control the browser
from selenium.webdriver.common.by import By # Helps locate elements on the webpage
from selenium.webdriver.chrome.options import Options # Allows setting browser options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import StaleElementReferenceException
# Set the working directory path
wd_path = '/Users/bchoe/Documents/DANL-210' # e.g., '/Users/bchoe/Documents/DANL-210'
os.chdir(wd_path) # Change the current working directory to wd_path
os.getcwd() # Retrieve and return the current working directory
# Create an instance of Chrome options
options = Options()
# Initialize the Chrome WebDriver with the specified options
# NOTE(review): no headless flag is set, so this opens a visible Chrome window;
# the browser session stays open for the rest of the script.
driver = webdriver.Chrome(options=options)
# =============================================================================
# Question 1
# =============================================================================
# Navigate the controlled browser to the first page of the quotes site.
url = 'https://quotes.toscrape.com/'
driver.get(url)
# %%
# =============================================================================
# Figuring out web-scraping strategies
# =============================================================================
# When "available", `id` and `class` are often the most convenient ways to locate web elements.
# Of course, XPath would also work.
# REPL-style exploration of the first page. Bare expressions such as
# `quotes[0].text` only display output in an interactive session; they are
# no-ops when the file is run as a script. The goal is to confirm that each
# locator returns one element per quote, in the same on-page order.
quotes = driver.find_elements(By.CLASS_NAME, 'text') # list of 10 WebElements
quotes[0].text
quotes[1].text
quotes[2].text
authors = driver.find_elements(By.CLASS_NAME, 'author') # list of 10 WebElements
authors[0].text
authors[1].text
authors[2].text
authors[9].text
tags_all = driver.find_elements(By.CLASS_NAME, 'tags') # list of 10 WebElements
tags_all[0].text
# Out[133]: 'Tags: change deep-thoughts thinking world'
tags_all[1].text
# Each quote links to its author's profile via an "(about)" anchor; the
# href attribute gives the absolute profile URL.
abouts = driver.find_elements(By.LINK_TEXT, '(about)') # list of 10 WebElements
abouts[0].get_attribute('href')
abouts[1].get_attribute('href')
abouts[2].get_attribute('href')
abouts[4].get_attribute('href')
# %%
# =============================================================================
# For-loop construction
# =============================================================================
# Question 1: visit pages 1-10 directly via the known URL pattern and
# collect every quote into one DataFrame (one row per quote).
rows = []  # accumulate plain lists; build the DataFrame once at the end
for p in range(1, 11):
    f_url = f'https://quotes.toscrape.com/page/{p}'
    driver.get(f_url)
    time.sleep(random.uniform(1, 2))  # Be polite: pause between requests to reduce load on the server

    quotes = driver.find_elements(By.CLASS_NAME, 'text')
    authors = driver.find_elements(By.CLASS_NAME, 'author')
    tags_all = driver.find_elements(By.CLASS_NAME, 'tags')
    abouts = driver.find_elements(By.LINK_TEXT, '(about)')

    # The four locators return parallel lists (one entry per quote on the
    # page), so iterate them in lockstep instead of indexing by position.
    # NOTE(review): `t.text` keeps the raw 'Tags: a b c' string; the handout
    # asks for a pipe-joined format ("life|humor") — TODO confirm which is wanted.
    for q, a, t, ab in zip(quotes, authors, tags_all, abouts):
        rows.append([q.text, a.text, t.text, ab.get_attribute('href')])

# Constructing the DataFrame once avoids the quadratic cost of calling
# pd.concat inside the loop and yields a clean 0..n-1 row index.
df = pd.DataFrame(rows, columns=['quote', 'author', 'tags', 'about'])
df.to_csv('data/quotes_2026_0225.csv', index = False)

Classwork 5
Scraping Quote Data with Python Selenium
Setup
The code below sets up the web-scraping environment with Python Selenium:
import pandas as pd
import numpy as np
import os, time, random
from io import StringIO
# Import the necessary modules from the Selenium library
from selenium import webdriver # Main module to control the browser
from selenium.webdriver.common.by import By # Helps locate elements on the webpage
from selenium.webdriver.chrome.options import Options # Allows setting browser options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import StaleElementReferenceException
# Set the working directory path
wd_path = 'ABSOLUTE_PATHNAME_OF_YOUR_WORKING_DIRECTORY' # e.g., '/Users/bchoe/Documents/DANL-210'
os.chdir(wd_path) # Change the current working directory to wd_path
os.getcwd() # Retrieve and return the current working directory
# Create an instance of Chrome options
options = Options()
# Initialize the Chrome WebDriver with the specified options
driver = webdriver.Chrome(options=options)

Goal
You will scrape all quotes from https://quotes.toscrape.com using Python selenium, then build two DataFrames:
- Part 1 (quotes table): one row per quote
- Part 2 (authors table): one row per author profile page
You will export both DataFrames as CSV files.
Part 1. Quotes DataFrame
Target
Create a DataFrame with one row per quote and these columns:
- quote (the quote text)
- author (the author's name)
- tags (all tags for the quote; stored as one string like "life|humor|inspirational")
- about (URL to the author's profile page)
Export as: data/quotes.csv
Question 1. URL pattern (pages 1 to 10)
Suppose we know the page URL format:
- Page format:
https://quotes.toscrape.com/page/{PAGE}/ - Example: https://quotes.toscrape.com/page/1/
Write a loop that visits pages 1 to 10 and collects all quotes.
Question 2. Unknown page count (click “Next” until it disappears)
Now suppose you do not know how many pages exist.
Instead of guessing the last page number, do this:
- Start at page 1
- Scrape the quotes on the current page
- Click the Next button
- Repeat until the Next button is not found
# Question 2: the page count is unknown, so start at page 1 and keep
# clicking "Next" until the link disappears, scraping each page on the way.
url = 'https://quotes.toscrape.com/'
driver.get(url)

rows = []  # accumulate plain lists; build the DataFrame once at the end
while True:
    quotes = driver.find_elements(By.CLASS_NAME, 'text')
    authors = driver.find_elements(By.CLASS_NAME, 'author')
    tags_all = driver.find_elements(By.CLASS_NAME, 'tags')
    abouts = driver.find_elements(By.LINK_TEXT, '(about)')

    # Parallel lists: one entry per quote on the current page.
    for q, a, t, ab in zip(quotes, authors, tags_all, abouts):
        rows.append([q.text, a.text, t.text, ab.get_attribute('href')])

    # Catch only the expected "link not found" error; a bare `except:`
    # would also swallow unrelated failures (e.g. a dead WebDriver session)
    # and silently truncate the scrape.
    try:
        next_btn = driver.find_element(By.PARTIAL_LINK_TEXT, 'Next')
    except NoSuchElementException:
        break  # last page reached: no "Next" link on this page
    next_btn.click()
    time.sleep(random.uniform(1, 2))  # Be polite: pause between requests to reduce load on the server

# Constructing the DataFrame once avoids the quadratic cost of calling
# pd.concat inside the loop and yields a clean 0..n-1 row index.
df = pd.DataFrame(rows, columns=['quote', 'author', 'tags', 'about'])
df.to_csv('data/quotes_2026_0227.csv', index = False)

Discussion
Welcome to our Classwork 5 Discussion Board! 👋
This space is designed for you to engage with your classmates about the material covered in Classwork 5.
Whether you are looking to delve deeper into the content, share insights, or have questions about the content, this is the perfect place for you.
If you have any specific questions for Byeong-Hak (@bcdanl) regarding the Classwork 5 materials or need clarification on any points, don’t hesitate to ask here.
All comments will be stored here.
Let’s collaborate and learn from each other!