"""Breadth-first crawl starting at nytimes.com, then draw the page link graph."""
import re  # retained from original; no longer used after switch to urljoin
from collections import deque
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
import networkx as nx
import matplotlib.pyplot as plt

# BFS frontier of URLs still to fetch. deque gives O(1) popleft;
# the original list.pop(0) was O(n) per dequeue.
url_queue = deque(["https://nytimes.com"])
# Crawl results: URL -> {"title": str, "linked_pages": [str, ...]}.
pages = {}


def process_next_url():
    """Fetch the next queued URL and record its title and outbound story links.

    Side effects: pops one URL from ``url_queue``, appends any discovered
    links back onto it, and adds one entry to ``pages``. Returns None.
    """
    next_url = url_queue.popleft()
    # If we have already processed this URL, don't process again.
    if next_url in pages:
        return
    try:
        # Timeout so one unresponsive server can't hang the whole crawl;
        # raise_for_status so 4xx/5xx pages are skipped, not parsed.
        response = requests.get(next_url, timeout=10)
        response.raise_for_status()
    except requests.RequestException:
        return  # best-effort crawl: skip unreachable/erroring pages
    soup = BeautifulSoup(response.text, "html.parser")
    # Front-page stories are wrapped in <section class="story-wrapper">.
    sections = soup.find_all("section", class_="story-wrapper")
    if not sections:
        # Not the front page: fall back to article-body sections.
        sections = soup.find_all("section", attrs={"name": "articleBody"})
    linked_pages = []
    for section in sections:
        anchor = section.a
        # Guard: a section may have no <a>, or an <a> without an href.
        if anchor is None or not anchor.get("href"):
            continue
        # urljoin resolves relative hrefs against the current page URL.
        # (The old regex-based prefixing mangled bare domains — e.g. it
        # turned "https://nytimes.com" into the prefix "https:/" — and
        # mishandled "/"-rooted paths.)
        url = urljoin(next_url, anchor["href"])
        url_queue.append(url)
        linked_pages.append(url)
    # <title> may be absent or empty; fall back to "blank" as before.
    title = soup.title.string if soup.title and soup.title.string else "blank"
    pages[next_url] = {"title": title, "linked_pages": linked_pages}


# Crawl until the frontier is exhausted or we've collected 30 pages.
while url_queue and len(pages) < 30:
    process_next_url()

# Visualize the crawled link structure as an undirected graph.
print("Visualizing now")
G = nx.Graph()
for key in pages:
    for link in pages[key]["linked_pages"]:
        G.add_edge(key, link)
nx.draw(G, with_labels=True, font_weight='bold')
plt.show()