Web Crawler

medium graph dfs / bfs hostname

Problem

Given a startUrl and an HtmlParser that returns the links found on any page, crawl every URL reachable from startUrl that shares the same hostname. The hostname is the part after http:// up to the next /. Return all crawled URLs in any order, visiting each page at most once.

InputstartUrl = "http://a.com/1", edges: 1→2, 1→x.com, 2→3
Output["http://a.com/1", "http://a.com/2", "http://a.com/3"]
x.com has a different hostname, so it is skipped; pages 1, 2, 3 share host a.com and are all reached.

def crawl(start_url, html_parser):
    host = start_url.split('/')[2]
    visited = {start_url}
    stack = [start_url]
    while stack:
        url = stack.pop()
        for link in html_parser.get_urls(url):
            if link.split('/')[2] == host and link not in visited:
                visited.add(link)
                stack.append(link)
    return list(visited)
function crawl(startUrl, htmlParser) {
  const host = startUrl.split('/')[2];
  const visited = new Set([startUrl]);
  const stack = [startUrl];
  while (stack.length) {
    const url = stack.pop();
    for (const link of htmlParser.getUrls(url)) {
      if (link.split('/')[2] === host && !visited.has(link)) {
        visited.add(link);
        stack.push(link);
      }
    }
  }
  return [...visited];
}
class Solution {
    public List<String> crawl(String startUrl, HtmlParser htmlParser) {
        String host = startUrl.split("/")[2];
        Set<String> visited = new HashSet<>();
        Deque<String> stack = new ArrayDeque<>();
        visited.add(startUrl);
        stack.push(startUrl);
        while (!stack.isEmpty()) {
            String url = stack.pop();
            for (String link : htmlParser.getUrls(url)) {
                if (link.split("/")[2].equals(host) && visited.add(link)) {
                    stack.push(link);
                }
            }
        }
        return new ArrayList<>(visited);
    }
}
vector<string> crawl(string startUrl, HtmlParser& htmlParser) {
    string host = hostOf(startUrl);
    unordered_set<string> visited{startUrl};
    vector<string> stack{startUrl};
    while (!stack.empty()) {
        string url = stack.back(); stack.pop_back();
        for (const string& link : htmlParser.getUrls(url)) {
            if (hostOf(link) == host && !visited.count(link)) {
                visited.insert(link);
                stack.push_back(link);
            }
        }
    }
    return {visited.begin(), visited.end()};
}
Time: O(V + E) Space: O(V)