"""Breadth-first crawl starting at nytimes.com, then draw the page link graph."""
import re  # retained from original; no longer used after switch to urljoin
from collections import deque
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
import networkx as nx
import matplotlib.pyplot as plt

# BFS frontier of URLs still to fetch. deque gives O(1) popleft;
# the original list.pop(0) was O(n) per dequeue.
url_queue = deque(["https://nytimes.com"])
# Crawl results: URL -> {"title": str, "linked_pages": [str, ...]}.
pages = {}


def process_next_url():
    """Fetch the next queued URL and record its title and outbound story links.

    Side effects: pops one URL from ``url_queue``, appends any discovered
    links back onto it, and adds one entry to ``pages``. Returns None.
    """
    next_url = url_queue.popleft()
    # If we have already processed this URL, don't process again.
    if next_url in pages:
        return
    try:
        # Timeout so one unresponsive server can't hang the whole crawl;
        # raise_for_status so 4xx/5xx pages are skipped, not parsed.
        response = requests.get(next_url, timeout=10)
        response.raise_for_status()
    except requests.RequestException:
        return  # best-effort crawl: skip unreachable/erroring pages
    soup = BeautifulSoup(response.text, "html.parser")
    # Front-page stories are wrapped in <section class="story-wrapper">.
    sections = soup.find_all("section", class_="story-wrapper")
    if not sections:
        # Not the front page: fall back to article-body sections.
        sections = soup.find_all("section", attrs={"name": "articleBody"})
    linked_pages = []
    for section in sections:
        anchor = section.a
        # Guard: a section may have no <a>, or an <a> without an href.
        if anchor is None or not anchor.get("href"):
            continue
        # urljoin resolves relative hrefs against the current page URL.
        # (The old regex-based prefixing mangled bare domains — e.g. it
        # turned "https://nytimes.com" into the prefix "https:/" — and
        # mishandled "/"-rooted paths.)
        url = urljoin(next_url, anchor["href"])
        url_queue.append(url)
        linked_pages.append(url)
    # <title> may be absent or empty; fall back to "blank" as before.
    title = soup.title.string if soup.title and soup.title.string else "blank"
    pages[next_url] = {"title": title, "linked_pages": linked_pages}


# Crawl until the frontier is exhausted or we've collected 30 pages.
while url_queue and len(pages) < 30:
    process_next_url()

# Visualize the crawled link structure as an undirected graph.
print("Visualizing now")
G = nx.Graph()
for key in pages:
    for link in pages[key]["linked_pages"]:
        G.add_edge(key, link)
nx.draw(G, with_labels=True, font_weight='bold')
plt.show()