Web Crawler Multithreaded
Problem
Given a startUrl and an htmlParser that returns the links found on any page, crawl every URL that shares the same hostname as startUrl. A hostname is the part of the URL between http:// and the next / (e.g. the hostname of http://news.com/a is news.com). Each page should be fetched at most once. Return all reachable same-host URLs in any order. The reference solution uses multiple threads, but the underlying algorithm is a graph traversal that guards a shared visited set.
startUrl = "http://news.com/", edges: news.com/ → {news.com/a, blog.org/x}; news.com/a → {news.com/b}; news.com/b → {news.com/}["http://news.com/", "http://news.com/a", "http://news.com/b"]from concurrent.futures import ThreadPoolExecutor
class Solution:
def crawl(self, startUrl, htmlParser):
host = startUrl.split('/')[2]
visited = {startUrl}
with ThreadPoolExecutor(max_workers=8) as pool:
def worker(url):
fresh = []
for link in htmlParser.getUrls(url):
if link.split('/')[2] == host and link not in visited:
visited.add(link)
fresh.append(link)
return list(pool.map(worker, fresh)) and None
worker(startUrl)
return list(visited)
function crawl(startUrl, htmlParser) {
const host = startUrl.split('/')[2];
const visited = new Set([startUrl]);
const queue = [startUrl];
while (queue.length) {
const url = queue.shift();
for (const link of htmlParser.getUrls(url)) {
if (link.split('/')[2] === host && !visited.has(link)) {
visited.add(link);
queue.push(link);
}
}
}
return [...visited];
}
class Solution {
public List<String> crawl(String startUrl, HtmlParser htmlParser) {
String host = startUrl.split("/")[2];
Set<String> visited = ConcurrentHashMap.newKeySet();
visited.add(startUrl);
Deque<String> queue = new ArrayDeque<>();
queue.add(startUrl);
while (!queue.isEmpty()) {
String url = queue.poll();
for (String link : htmlParser.getUrls(url)) {
if (link.split("/")[2].equals(host) && visited.add(link)) {
queue.add(link);
}
}
}
return new ArrayList<>(visited);
}
}
class Solution {
public:
vector<string> crawl(string startUrl, HtmlParser& htmlParser) {
string host = hostOf(startUrl);
unordered_set<string> visited{startUrl};
queue<string> q;
q.push(startUrl);
while (!q.empty()) {
string url = q.front(); q.pop();
for (auto& link : htmlParser.getUrls(url)) {
if (hostOf(link) == host && !visited.count(link)) {
visited.insert(link);
q.push(link);
}
}
}
return {visited.begin(), visited.end()};
}
string hostOf(const string& u) {
int s = u.find("//") + 2;
int e = u.find('/', s);
return u.substr(s, e - s);
}
};
Explanation
At its heart this is a graph traversal: pages are nodes, links are edges, and we explore everything reachable from startUrl that shares the same hostname. The only twist is doing it with multiple threads while keeping a single shared visited set safe.
The hostname is pulled out with startUrl.split('/')[2] — for http://news.com/a that's news.com. Every link we discover is kept only if its hostname matches and we haven't seen it before.
The Python solution uses a ThreadPoolExecutor: a worker fetches one page's links, records any fresh same-host URLs in visited, then recursively dispatches a worker for each new URL via pool.map. The threads fan out across the link graph in parallel.
The shared visited set is what prevents fetching a page twice and stops infinite loops when links form a cycle. Each URL gets added exactly once, so each page is crawled at most once.
Example: from http://news.com/ we reach news.com/a and news.com/b, but blog.org/x is dropped because its host differs. The result is the three same-host URLs.