Repeated DNA Sequences

medium string hash map sliding window

Problem

Given a DNA string s (over {A, C, G, T}), return every 10-letter substring that appears more than once. Order doesn't matter; each repeated substring appears once in the result.

Inputs = "AAAAACCCCCAAAAACCCCCCAAAAAGGGTT"
Output["AAAAACCCCC", "CCCCCAAAAA"]
Slide a 10-char window across s. Use a hash set "seen" and a second set "added" so duplicates are emitted exactly once. (We use k=4 here so it's visible on small inputs.)

def find_repeated_dna_sequences(s):
    seen, out = set(), set()
    for i in range(len(s) - 9):
        sub = s[i:i + 10]
        if sub in seen:
            out.add(sub)
        else:
            seen.add(sub)
    return list(out)
function findRepeatedDnaSequences(s) {
  const seen = new Set(), out = new Set();
  for (let i = 0; i + 10 <= s.length; i++) {
    const sub = s.slice(i, i + 10);
    if (seen.has(sub)) out.add(sub);
    else seen.add(sub);
  }
  return [...out];
}
class Solution {
    public List<String> findRepeatedDnaSequences(String s) {
        Set<String> seen = new HashSet<>(), out = new HashSet<>();
        for (int i = 0; i + 10 <= s.length(); i++) {
            String sub = s.substring(i, i + 10);
            if (!seen.add(sub)) out.add(sub);
        }
        return new ArrayList<>(out);
    }
}
vector<string> findRepeatedDnaSequences(string s) {
    unordered_set<string> seen, out;
    for (int i = 0; i + 10 <= (int) s.size(); i++) {
        string sub = s.substr(i, 10);
        if (!seen.insert(sub).second) out.insert(sub);
    }
    return { out.begin(), out.end() };
}
Time: O(n) Space: O(n)