Web Crawler Multithreaded

medium graph bfs concurrency

Problem

Given a startUrl and an htmlParser that returns the links found on any page, crawl every URL that shares the same hostname as startUrl. A hostname is the part of the URL between http:// and the next / (e.g. the hostname of http://news.com/a is news.com). Each page should be fetched at most once. Return all reachable same-host URLs in any order. The reference solution uses multiple threads, but the underlying algorithm is a graph traversal that guards a shared visited set.

InputstartUrl = "http://news.com/", edges: news.com/ → {news.com/a, blog.org/x}; news.com/a → {news.com/b}; news.com/b → {news.com/}
Output["http://news.com/", "http://news.com/a", "http://news.com/b"]
blog.org/x is reachable but has a different hostname, so it is skipped. Same-host pages are visited exactly once even when links form a cycle.

from concurrent.futures import ThreadPoolExecutor

class Solution:
    def crawl(self, startUrl, htmlParser):
        host = startUrl.split('/')[2]
        visited = {startUrl}
        with ThreadPoolExecutor(max_workers=8) as pool:
            def worker(url):
                fresh = []
                for link in htmlParser.getUrls(url):
                    if link.split('/')[2] == host and link not in visited:
                        visited.add(link)
                        fresh.append(link)
                return list(pool.map(worker, fresh)) and None
            worker(startUrl)
        return list(visited)
function crawl(startUrl, htmlParser) {
  const host = startUrl.split('/')[2];
  const visited = new Set([startUrl]);
  const queue = [startUrl];
  while (queue.length) {
    const url = queue.shift();
    for (const link of htmlParser.getUrls(url)) {
      if (link.split('/')[2] === host && !visited.has(link)) {
        visited.add(link);
        queue.push(link);
      }
    }
  }
  return [...visited];
}
class Solution {
    public List<String> crawl(String startUrl, HtmlParser htmlParser) {
        String host = startUrl.split("/")[2];
        Set<String> visited = ConcurrentHashMap.newKeySet();
        visited.add(startUrl);
        Deque<String> queue = new ArrayDeque<>();
        queue.add(startUrl);
        while (!queue.isEmpty()) {
            String url = queue.poll();
            for (String link : htmlParser.getUrls(url)) {
                if (link.split("/")[2].equals(host) && visited.add(link)) {
                    queue.add(link);
                }
            }
        }
        return new ArrayList<>(visited);
    }
}
class Solution {
public:
    vector<string> crawl(string startUrl, HtmlParser& htmlParser) {
        string host = hostOf(startUrl);
        unordered_set<string> visited{startUrl};
        queue<string> q;
        q.push(startUrl);
        while (!q.empty()) {
            string url = q.front(); q.pop();
            for (auto& link : htmlParser.getUrls(url)) {
                if (hostOf(link) == host && !visited.count(link)) {
                    visited.insert(link);
                    q.push(link);
                }
            }
        }
        return {visited.begin(), visited.end()};
    }
    string hostOf(const string& u) {
        int s = u.find("//") + 2;
        int e = u.find('/', s);
        return u.substr(s, e - s);
    }
};
Time: O(V + E) Space: O(V)