import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, WebDriverException
import os # Import the os module to check for file existence
# IMPORTANT: Before running this script, you need to:
# 1. Install the required packages: pip install selenium pandas requests beautifulsoup4
# 2. Download the appropriate WebDriver for your browser (e.g., ChromeDriver for Chrome).
# You can find ChromeDriver here: https://chromedriver.chromium.org/downloads
# 3. Place the WebDriver executable in a directory that's in your system's PATH,
# or specify its path directly in the Service object (e.g., Service(executable_path='/path/to/chromedriver')).
def initialize_driver():
    """
    Build and return a headless Chrome WebDriver.

    The chromedriver executable is expected to be discoverable on PATH. To use
    an explicit location instead, construct a
    Service(executable_path='/path/to/your/chromedriver') and pass it to
    webdriver.Chrome via the ``service`` argument.
    """
    # Command-line switches, applied in this order.
    chrome_flags = (
        '--headless',               # no browser UI
        '--disable-gpu',            # needed for headless mode on some systems
        '--no-sandbox',             # bypass the OS security model (some environments require it)
        '--disable-dev-shm-usage',  # work around limited shared-memory resources
        'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    )

    chrome_options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)

    return webdriver.Chrome(options=chrome_options)
def get_institution_links(base_schools_partners_url="https://www.edx.org/schools-partners"):
    """
    Collect the edX profile link of every institution on the schools-partners page.

    The page is rendered with Selenium so dynamically loaded content is
    available. Several CSS selectors are tried in turn; the first one that
    yields elements is used.

    Returns a dictionary mapping institution names to their profile URLs. The
    dictionary is empty when nothing could be found or an error occurred.
    """
    print(f"Fetching institution links from: {base_schools_partners_url} using Selenium...")

    # Candidate selectors for institution anchors, most likely first.
    candidate_selectors = (
        'a[href*="/school/"]',            # href contains "/school/"
        'a[data-testid*="institution"]',  # data-testid contains "institution"
        '.institution-card a',            # anchors inside institution cards
        '[class*="institution"] a',       # anchors inside "institution"-classed elements
        'a[href*="edx.org/school/"]',     # full edX school URLs
    )

    links_by_name = {}
    driver = None
    try:
        driver = initialize_driver()
        driver.get(base_schools_partners_url)

        # One shared wait object; each selector gets up to 20 seconds.
        wait = WebDriverWait(driver, 20)

        anchors = []
        for css in candidate_selectors:
            print(f"Trying selector: {css}")
            try:
                anchors = wait.until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, css))
                )
            except TimeoutException:
                print(f"Selector {css} timed out, trying next...")
                continue
            if anchors:
                print(f"Found {len(anchors)} elements with selector: {css}")
                break

        if not anchors:
            print("No institution elements found with any selector. Page structure might have changed.")
            return links_by_name

        for anchor in anchors:
            try:
                href = anchor.get_attribute('href')
                if not href or '/school/' not in href:
                    continue

                # Preferred name source: the anchor's own visible text.
                name = anchor.text.strip()

                # Fallback 1: the text of the anchor's parent element.
                if not name:
                    name = anchor.find_element(By.XPATH, '..').text.strip()

                # Fallback 2: the URL slug, e.g. "/school/harvardx" -> "harvardx".
                if not name:
                    after_school = href.partition('/school/')[2]
                    name = after_school.split('/')[0].split('?')[0]

                if name:
                    links_by_name[name] = href
                    print(f"Found institution: {name} -> {href}")
            except Exception as e:
                # A single bad element must not stop the whole harvest.
                print(f"Error processing institution element: {str(e)}")
                continue

        print(f"Total institutions found: {len(links_by_name)}")
    except Exception as e:
        print(f"Error fetching institution links: {str(e)}")
    finally:
        # Always release the browser, including on the early return above.
        if driver:
            driver.quit()
    return links_by_name
def get_first_course_title(driver, institution_url):
    """
    Navigate to an institution's edX page and extract the first course title.

    A prioritized list of locators is tried in order; the first one whose
    element has non-empty text wins.

    Args:
        driver: Selenium WebDriver used to load the page and look up elements.
        institution_url: URL of the institution's edX profile page.

    Returns:
        The stripped course title, or an empty string if no locator produced
        any text or an unexpected error occurred.
    """
    # (strategy, value) pairs in order of preference.
    locators_to_try = [
        (By.CSS_SELECTOR, 'h3[data-testid="course-title-popover-trigger"]'),  # Most specific selector
        (By.CSS_SELECTOR, '.course-title'),  # Generic course title class
        (By.CSS_SELECTOR, 'h3.course-name'),  # Course name heading
        (By.CSS_SELECTOR, '[class*="course"][class*="title"]'),  # "course" and "title" both in class
        (By.CSS_SELECTOR, '.course-card h3'),  # H3 within course card
        (By.CSS_SELECTOR, '.course-list h3'),  # H3 within course list
        # H3 containing the word "Course". This used to be the CSS selector
        # 'h3:contains("Course")', but ":contains()" is a jQuery extension, not
        # CSS. Selenium rejects it as an invalid selector, which (depending on
        # the Selenium version) either wastes the whole wait or escapes the
        # per-locator handler below and aborts the remaining fallbacks.
        # XPath expresses the same match validly.
        (By.XPATH, '//h3[contains(., "Course")]'),
        (By.CSS_SELECTOR, '.card-title'),  # Generic card title
        (By.CSS_SELECTOR, 'h2.course-title'),  # H2 course title
        (By.CSS_SELECTOR, 'h4.course-title'),  # H4 course title
        (By.CSS_SELECTOR, '[data-testid*="course"] h3'),  # H3 under a "course" data-testid
        (By.CSS_SELECTOR, '.course h3'),  # H3 within course element
        (By.CSS_SELECTOR, 'article h3'),  # H3 within article (course cards often use article)
    ]

    try:
        print(f"Navigating to: {institution_url}")
        driver.get(institution_url)
        # Give the page a moment to start rendering before probing it.
        time.sleep(3)

        for strategy, selector in locators_to_try:
            try:
                # Each locator gets up to 10 seconds to appear in the DOM.
                course_title_element = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((strategy, selector))
                )
                course_title = course_title_element.text.strip()
            except (TimeoutException, NoSuchElementException):
                continue  # Try the next locator

            # An element with no text does not count as a match.
            if course_title:
                print(f"Found course title with selector '{selector}': {course_title}")
                return course_title

        # No locator produced a title.
        print(f"No course title found for {institution_url}")
        return ""
    except Exception as e:
        # Best effort: report the failure and let the caller carry on.
        print(f"Error getting course title for {institution_url}: {str(e)}")
        return ""
def scrape_edx_institutions():
    """
    Run the full scrape: discover institutions, then record each one's first course.

    Every result is appended to "edx_institution_courses.csv" as soon as it is
    obtained. The header row is written only when the file does not exist yet;
    otherwise new rows are appended to the existing file.
    """
    institution_links = get_institution_links()
    if not institution_links:
        print("No institution links found. Exiting.")
        return

    csv_filename = "edx_institution_courses.csv"
    csv_columns = ["Institution", "First Course Offered"]

    if os.path.isfile(csv_filename):
        print(f"Appending to existing CSV file: {csv_filename}")
    else:
        # A brand-new file starts with just the header row.
        pd.DataFrame(columns=csv_columns).to_csv(csv_filename, index=False)
        print(f"Created new CSV file: {csv_filename}")

    # One browser session is reused for every institution page.
    driver = None
    try:
        driver = initialize_driver()
        for institution_name, institution_url in institution_links.items():
            print(f"\nProcessing: {institution_name}")
            first_course = get_first_course_title(driver, institution_url)

            # Write this single row immediately so progress survives a crash.
            single_row = pd.DataFrame(
                [[institution_name, first_course]], columns=csv_columns
            )
            single_row.to_csv(csv_filename, mode='a', header=False, index=False)
            print(f"Saved: {institution_name} -> {first_course}")

            # Pause between requests to be respectful to the site.
            time.sleep(2)
    except Exception as e:
        print(f"Error during scraping: {str(e)}")
    finally:
        if driver:
            driver.quit()
        print(f"\nScraping completed. Results saved to {csv_filename}")
if __name__ == "__main__":
    # Start the scrape only when run as a script, not when imported.
    scrape_edx_institutions()