# edx_course_scrapper.py: scrape edX partner institutions and each one's first listed course.
import os  # Used later to check whether the output CSV already exists
import time

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service  # Only needed for an explicit driver path (see below)
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException

# IMPORTANT: Before running this script, you need to:
# 1. Install Selenium: pip install selenium pandas
# 2. Download the appropriate WebDriver for your browser (e.g., ChromeDriver for Chrome).
#    You can find ChromeDriver here: https://chromedriver.chromium.org/downloads
# 3. Place the WebDriver executable in a directory that's in your system's PATH,
#    or specify its path directly in the Service object (e.g., Service(executable_path='/path/to/chromedriver')).
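#    Note: with Selenium >= 4.6, Selenium Manager resolves a matching driver
#    automatically, so steps 2-3 are usually unnecessary on recent installs.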

def initialize_driver():
    """Initializes and returns a headless Chrome WebDriver."""
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # Run in headless mode (no browser UI)
    options.add_argument('--disable-gpu') # Required for headless on some systems
    options.add_argument('--no-sandbox') # Bypass OS security model, required for some environments
    options.add_argument('--disable-dev-shm-usage') # Overcome limited resource problems
    options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')
    
    # If chromedriver is not in PATH, specify its path here:
    # service = Service(executable_path='/path/to/your/chromedriver')
    # driver = webdriver.Chrome(service=service, options=options)
    
    # If chromedriver is in PATH:
    driver = webdriver.Chrome(options=options)
    return driver
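
# Quick smoke test for the driver setup (a hedged sketch; run manually if needed):
# _drv = initialize_driver()
# _drv.get("https://www.edx.org")
# print(_drv.title)
# _drv.quit()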

def get_institution_links(base_schools_partners_url="https://www.edx.org/schools-partners"):
    """
    Scrapes the base edX schools-partners page to get the actual links for each institution
    using Selenium to handle dynamically loaded content.
    Returns a dictionary mapping institution names to their full edX profile URLs.
    """
    print(f"Fetching institution links from: {base_schools_partners_url} using Selenium...")
    driver = None
    institution_links = {}
    try:
        driver = initialize_driver()
        driver.get(base_schools_partners_url)
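
        # Optional (hedged assumption): edX may lazy-load partner cards, so a
        # scroll to the bottom can surface more links. Uncomment if few are found.
        # driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # time.sleep(2)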

        # Wait for institution elements to appear. Keep the per-selector wait
        # moderate: it applies to each fallback below, so the worst case on a
        # dead page is roughly 10s x 5 selectors.
        wait = WebDriverWait(driver, 10)
        # Try multiple selectors for institution links
        selectors_to_try = [
            'a[href*="/school/"]',  # Links containing "/school/" in href
            'a[data-testid*="institution"]',  # Data testid containing "institution"
            '.institution-card a',  # Anchor tags within institution card elements
            '[class*="institution"] a',  # Anchor tags within elements with "institution" in class name
            'a[href*="edx.org/school/"]'  # Full edX school URLs
        ]
        
        institution_elements = []
        for selector in selectors_to_try:
            try:
                print(f"Trying selector: {selector}")
                institution_elements = wait.until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, selector))
                )
                if institution_elements:
                    print(f"Found {len(institution_elements)} elements with selector: {selector}")
                    break
            except TimeoutException:
                print(f"Selector {selector} timed out, trying next...")
                continue
        
        if not institution_elements:
            print("No institution elements found with any selector. Page structure might have changed.")
            return institution_links

        for element in institution_elements:
            try:
                href = element.get_attribute('href')
                if href and '/school/' in href:
                    # Extract institution name from the link text or data attributes
                    institution_name = element.text.strip()
                    
                    # If link text is empty, try to get name from parent element or other attributes
                    if not institution_name:
                        parent = element.find_element(By.XPATH, '..')
                        institution_name = parent.text.strip()
                    
                    # If still empty, extract from URL
                    if not institution_name:
                        # Extract institution name from URL like "/school/harvardx" -> "harvardx"
                        url_parts = href.split('/school/')
                        if len(url_parts) > 1:
                            institution_name = url_parts[1].split('/')[0].split('?')[0]
                    
                    if institution_name and href:
                        institution_links[institution_name] = href
                        print(f"Found institution: {institution_name} -> {href}")
            except Exception as e:
                print(f"Error processing institution element: {str(e)}")
                continue
        
        print(f"Total institutions found: {len(institution_links)}")
        
    except Exception as e:
        print(f"Error fetching institution links: {str(e)}")
    finally:
        if driver:
            driver.quit()
    
    return institution_links
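
# Illustrative shape of the returned mapping (values are examples, not
# guaranteed to match the live site):
# {"HarvardX": "https://www.edx.org/school/harvardx", ...}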

def get_first_course_title(driver, institution_url):
    """
    Navigates to an institution's edX page and extracts the title of the first course
    using multiple fallback selectors for robustness.
    Returns the course title as a string, or an empty string if no course is found.
    """
    try:
        print(f"Navigating to: {institution_url}")
        driver.get(institution_url)
        
        # Wait for page to load
        time.sleep(3)
        
        # Try multiple selectors for course titles in order of preference.
        # Note: jQuery-style pseudo-selectors such as h3:contains("Course") are
        # not valid CSS and would raise InvalidSelectorException in Selenium,
        # so only standard CSS selectors are listed here.
        selectors_to_try = [
            'h3[data-testid="course-title-popover-trigger"]',  # Most specific selector
            '.course-title',  # Generic course title class
            'h3.course-name',  # Course name heading
            '[class*="course"][class*="title"]',  # Both "course" and "title" in the class name
            '.course-card h3',  # H3 within a course card
            '.course-list h3',  # H3 within a course list
            '.card-title',  # Generic card title
            'h2.course-title',  # H2 course title
            'h4.course-title',  # H4 course title
            '[data-testid*="course"] h3',  # H3 within an element with "course" in data-testid
            '.course h3',  # H3 within a course element
            'article h3',  # H3 within an article (course cards often use article)
        ]
        
        for selector in selectors_to_try:
            try:
                # Use a short per-selector wait: with ~12 fallback selectors, a
                # long timeout multiplies quickly on pages with no match.
                course_title_element = WebDriverWait(driver, 5).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, selector))
                )
                course_title = course_title_element.text.strip()
                if course_title:  # Only return if the element actually has text
                    print(f"Found course title with selector '{selector}': {course_title}")
                    return course_title
            except (TimeoutException, NoSuchElementException):
                continue  # Try the next selector
        
        # If no course title found with any selector
        print(f"No course title found for {institution_url}")
        return ""
        
    except Exception as e:
        print(f"Error getting course title for {institution_url}: {str(e)}")
        return ""

def scrape_edx_institutions():
    """
    Main function that orchestrates the scraping process.
    Gets institution links, then scrapes the first course title for each institution.
    Saves results incrementally to a CSV file.
    """
    # Get institution links
    institution_links = get_institution_links()
    
    if not institution_links:
        print("No institution links found. Exiting.")
        return
    
    # Prepare CSV file
    csv_filename = "edx_institution_courses.csv"
    
    # Check if CSV file already exists
    file_exists = os.path.isfile(csv_filename)
    
    # If file doesn't exist, create header
    if not file_exists:
        df_header = pd.DataFrame(columns=["Institution", "First Course Offered"])
        df_header.to_csv(csv_filename, index=False)
        print(f"Created new CSV file: {csv_filename}")
    else:
        print(f"Appending to existing CSV file: {csv_filename}")
    
    # Initialize WebDriver for course scraping
    driver = None
    try:
        driver = initialize_driver()
        
        # Process each institution
        for institution_name, institution_url in institution_links.items():
            print(f"\nProcessing: {institution_name}")
            
            # Get the first course title
            first_course = get_first_course_title(driver, institution_url)
            
            # Create a DataFrame for this row
            row_data = pd.DataFrame({
                "Institution": [institution_name],
                "First Course Offered": [first_course]
            })
            
            # Append to CSV file
            row_data.to_csv(csv_filename, mode='a', header=False, index=False)
            print(f"Saved: {institution_name} -> {first_course}")
            
            # Add a small delay between requests to be respectful
            time.sleep(2)
    
    except Exception as e:
        print(f"Error during scraping: {str(e)}")
    finally:
        if driver:
            driver.quit()
    
    print(f"\nScraping completed. Results saved to {csv_filename}")
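
# Illustrative CSV output (actual institution names and courses will vary):
# Institution,First Course Offered
# HarvardX,CS50's Introduction to Computer Science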

if __name__ == "__main__":
    scrape_edx_institutions()