Longest Duplicate Substring

hard binary search rolling hash string

Problem

Given a string s, find a duplicated substring — a contiguous substring that occurs (possibly overlapping) two or more times in s — of the longest possible length. If no duplicated substring exists, return the empty string. If several have the maximum length, any one of them is acceptable.

Inputs = "banana"

Output"ana"

"ana" appears at index 1 and index 3 (overlapping). No length-4 substring repeats, so 3 is the maximum.

s (lowercase letters)

def longest_dup_substring(s):
    MOD, BASE = (1 << 61) - 1, 131
    n = len(s)

    def has_dup(L):
        if L == 0:
            return 0
        h, p = 0, pow(BASE, L, MOD)
        for i in range(L):
            h = (h * BASE + ord(s[i])) % MOD
        seen = {h: [0]}
        for i in range(L, n):
            h = (h * BASE + ord(s[i]) - ord(s[i - L]) * p) % MOD
            start = i - L + 1
            if h in seen:
                for j in seen[h]:
                    if s[j:j + L] == s[start:start + L]:
                        return start
                seen[h].append(start)
            else:
                seen[h] = [start]
        return -1

    lo, hi, best = 1, n - 1, 0
    while lo <= hi:
        mid = (lo + hi) // 2
        pos = has_dup(mid)
        if pos != -1:
            best, lo = pos, mid + 1
        else:
            hi = mid - 1
    L = hi
    return s[best:best + L]

function longestDupSubstring(s) {
  const MOD = 1125899906842597n, BASE = 131n;
  const n = s.length;

  function hasDup(L) {
    if (L === 0) return 0;
    let h = 0n, p = 1n;
    for (let k = 0; k < L; k++) p = (p * BASE) % MOD;
    for (let i = 0; i < L; i++) h = (h * BASE + BigInt(s.charCodeAt(i))) % MOD;
    const seen = new Map([[h, [0]]]);
    for (let i = L; i < n; i++) {
      h = (h * BASE + BigInt(s.charCodeAt(i)) - BigInt(s.charCodeAt(i - L)) * p) % MOD;
      h = ((h % MOD) + MOD) % MOD;
      const start = i - L + 1;
      if (seen.has(h)) {
        for (const j of seen.get(h))
          if (s.substr(j, L) === s.substr(start, L)) return start;
        seen.get(h).push(start);
      } else seen.set(h, [start]);
    }
    return -1;
  }

  let lo = 1, hi = n - 1, best = 0;
  while (lo <= hi) {
    const mid = (lo + hi) >> 1;
    const pos = hasDup(mid);
    if (pos !== -1) { best = pos; lo = mid + 1; }
    else hi = mid - 1;
  }
  return s.substr(best, hi);
}

class Solution {
    long MOD = 1125899906842597L, BASE = 131L;
    String s; int n;

    int hasDup(int L) {
        if (L == 0) return 0;
        long h = 0, p = 1;
        for (int k = 0; k < L; k++) p = (p * BASE) % MOD;
        for (int i = 0; i < L; i++) h = (h * BASE + s.charAt(i)) % MOD;
        Map<Long, List<Integer>> seen = new HashMap<>();
        seen.put(h, new ArrayList<>(List.of(0)));
        for (int i = L; i < n; i++) {
            h = (h * BASE + s.charAt(i) - s.charAt(i - L) * p) % MOD;
            h = ((h % MOD) + MOD) % MOD;
            int start = i - L + 1;
            if (seen.containsKey(h)) {
                for (int j : seen.get(h))
                    if (s.substring(j, j + L).equals(s.substring(start, start + L))) return start;
                seen.get(h).add(start);
            } else seen.put(h, new ArrayList<>(List.of(start)));
        }
        return -1;
    }

    public String longestDupSubstring(String str) {
        s = str; n = s.length();
        int lo = 1, hi = n - 1, best = 0;
        while (lo <= hi) {
            int mid = (lo + hi) / 2, pos = hasDup(mid);
            if (pos != -1) { best = pos; lo = mid + 1; }
            else hi = mid - 1;
        }
        return s.substring(best, best + hi);
    }
}

class Solution {
    string s; int n;
    const long long MOD = 1125899906842597LL, BASE = 131;

    int hasDup(int L) {
        if (L == 0) return 0;
        long long h = 0, p = 1;
        for (int k = 0; k < L; k++) p = (__int128)p * BASE % MOD;
        for (int i = 0; i < L; i++) h = ((__int128)h * BASE + s[i]) % MOD;
        unordered_map<long long, vector<int>> seen;
        seen[h] = {0};
        for (int i = L; i < n; i++) {
            h = ((__int128)h * BASE + s[i] - (__int128)s[i - L] * p) % MOD;
            h = (h % MOD + MOD) % MOD;
            int start = i - L + 1;
            if (seen.count(h)) {
                for (int j : seen[h])
                    if (s.compare(j, L, s, start, L) == 0) return start;
                seen[h].push_back(start);
            } else seen[h] = {start};
        }
        return -1;
    }
public:
    string longestDupSubstring(string str) {
        s = str; n = s.size();
        int lo = 1, hi = n - 1, best = 0;
        while (lo <= hi) {
            int mid = (lo + hi) / 2, pos = hasDup(mid);
            if (pos != -1) { best = pos; lo = mid + 1; }
            else hi = mid - 1;
        }
        return s.substr(best, hi);
    }
};

Explanation

Two ideas combine here. First, binary-search the length of the answer: if a duplicated substring of length L exists, then a shorter one always exists too, so the "does a repeat of length L exist" property is monotone and searchable.

Second, to test a single length quickly we use a Rabin-Karp rolling hash. We compute a hash for the first window of length L, then slide it one character at a time, updating the hash in O(1) instead of recomputing from scratch. Each window's hash goes into a map; if the same hash appears again we have a likely duplicate.

Because hashes can collide, when two windows share a hash we do a real character comparison to confirm they are truly equal before returning the start index. This keeps the result correct.

The outer loop keeps the best start index found. If length mid has a duplicate we record it and try longer (lo = mid + 1); if not we shrink (hi = mid - 1). At the end we slice the best substring out of s.

Example: s = "banana". Length 3 finds "ana" repeating at indices 1 and 3, but length 4 finds nothing, so the answer is "ana".

Time: O(n log n) Space: O(n)