USACO 2015 February · Gold · all three problems
Problem 1 — Cow Hopscotch (Gold)
Statement
R × C grid, integer labels in 1..K, jumps must be strictly down, strictly right, and to a differently-labelled cell. Count distinct (1, 1) → (R, C) jump-sequences modulo 109+7. Same as Silver but with much larger R, C.
Constraints
- 2 ≤ R, C ≤ 750.
- 1 ≤ K ≤ R · C.
- Output modulo 1 000 000 007.
- Time limit: 2s.
Key idea
Direct O(R²C²) DP is ~3·1011 ops — dead. Rewrite the recurrence as:
f[r][c] = (sum of f[r'][c'] over all r' < r, c' < c) − (sum of f[r'][c'] over r' < r, c' < c, label[r'][c'] == label[r][c]).
The first term is a 2D prefix sum of the full f table — computable incrementally.
The second term is a "per-colour" prefix sum. The standard trick: maintain, for each colour,
a 2D Fenwick tree (BIT) of f; at cell (r, c) query the BIT for label[r][c] over the
rectangle [0..r−1, 0..c−1], subtract from the global prefix, mod, then add f[r][c]
to the global prefix AND to the BIT for label[r][c]. Memory: O(R·C) per colour in the worst case
would blow up, but BITs are allocated lazily only for colours that actually appear (and total
sum of BIT sizes is bounded by R·C entries). Time: O(R·C·log²) ≈ 750·750·10·10 ≈ 5·10⁷.
Complexity target
O(R · C · log R · log C) with per-colour 2D BITs.
Reference solution (C++)
#include <bits/stdc++.h>
using namespace std;
const long long MOD = 1000000007LL;
int R, C, K;
struct BIT2D {
int n, m;
vector<vector<long long>> bit;
void init(int n_, int m_) { n = n_; m = m_; bit.assign(n+1, vector<long long>(m+1, 0)); }
void upd(int x, int y, long long v) {
for (int i = x; i <= n; i += i & -i)
for (int j = y; j <= m; j += j & -j) bit[i][j] = (bit[i][j] + v) % MOD;
}
long long qry(int x, int y) {
long long s = 0;
for (int i = x; i > 0; i -= i & -i)
for (int j = y; j > 0; j -= j & -j) s = (s + bit[i][j]) % MOD;
return s;
}
};
int main() {
ios_base::sync_with_stdio(false);
cin.tie(nullptr);
cin >> R >> C >> K;
vector<vector<int>> g(R, vector<int>(C));
for (int r = 0; r < R; ++r)
for (int c = 0; c < C; ++c) cin >> g[r][c];
vector<BIT2D> bit(K + 1);
for (int k = 1; k <= K; ++k) bit[k].init(R, C); // lazy in spirit; here we just allocate
BIT2D total; total.init(R, C);
vector<vector<long long>> f(R, vector<long long>(C, 0));
f[0][0] = 1;
total.upd(1, 1, 1);
bit[g[0][0]].upd(1, 1, 1);
for (int r = 0; r < R; ++r) {
for (int c = 0; c < C; ++c) {
if (r == 0 && c == 0) continue;
long long all = (r >= 1 && c >= 1) ? total.qry(r, c) : 0;
long long same = (r >= 1 && c >= 1) ? bit[g[r][c]].qry(r, c) : 0;
f[r][c] = (all - same + MOD) % MOD;
total.upd(r + 1, c + 1, f[r][c]);
bit[g[r][c]].upd(r + 1, c + 1, f[r][c]);
}
}
cout << f[R-1][C-1] << "\n";
return 0;
}
Pitfalls
- Per-colour BITs use a lot of memory. Allocating K full BIT2D at K = R·C = 562500 is 5·10⁸ longs — impossible. In practice, allocate BIT2Ds lazily only when a colour first appears; total nonzero memory is O(R·C). The code above is illustrative; the production version must defer
init. - (all − same) can go negative under mod. Add MOD before taking final mod.
- 1-indexed BITs. Convert 0-indexed grid coords to 1-indexed before updating.
- O(RC·log²) is tight but fits. Keep inner loops branchless and avoid unnecessary modulos inside the BIT walk.
Problem 2 — Censoring (Gold)
Statement
Same setup as Bronze/Silver Censoring but with N patterns instead of one. Repeatedly find the earliest occurrence (smallest start index, ties broken by — uniquely determined since no pattern is a substring of another) of any pattern in S, delete it, repeat until S contains no pattern. Output the final S (guaranteed non-empty).
Constraints
- |S| ≤ 105.
- N patterns, total length of patterns ≤ 105.
- No pattern is a substring of another.
- Lowercase a–z only.
- Time limit: 2s.
Key idea
Build an Aho–Corasick automaton over the N patterns. Then the Silver KMP trick generalises: walk
a stack of (character, AC-node) pairs through S; at each step, advance the AC node along the
incoming character (using go[] goto links, including fail links). If the current
node has a "matched pattern of length L" marker, pop L characters off the stack and reset the
current node to the state of the new top. Because no pattern is a substring of another, exactly
one pattern can be matched at a given node — no ambiguity. Linear in |S| + total pattern length.
Complexity target
O((|S| + total |T_i|) · σ) where σ = 26.
Reference solution (C++)
#include <bits/stdc++.h>
using namespace std;
struct AC {
struct Node { int nxt[26]; int fail; int outLen; }; // outLen = matched-pattern length, 0 if none
vector<Node> t;
AC() { newNode(); }
int newNode() {
t.push_back({});
for (int i = 0; i < 26; ++i) t.back().nxt[i] = 0;
t.back().fail = 0; t.back().outLen = 0;
return (int)t.size() - 1;
}
void add(const string& p) {
int u = 0;
for (char c : p) {
int x = c - 'a';
if (!t[u].nxt[x]) t[u].nxt[x] = newNode();
u = t[u].nxt[x];
}
t[u].outLen = (int)p.size();
}
void build() {
queue<int> q;
for (int i = 0; i < 26; ++i) if (t[0].nxt[i]) q.push(t[0].nxt[i]);
while (!q.empty()) {
int u = q.front(); q.pop();
for (int i = 0; i < 26; ++i) {
int v = t[u].nxt[i];
if (v) { t[v].fail = t[t[u].fail].nxt[i]; q.push(v); }
else { t[u].nxt[i] = t[t[u].fail].nxt[i]; }
}
if (!t[u].outLen && t[t[u].fail].outLen) t[u].outLen = t[t[u].fail].outLen;
}
}
};
int main() {
ios_base::sync_with_stdio(false);
cin.tie(nullptr);
string S; cin >> S;
int N; cin >> N;
AC ac;
for (int i = 0; i < N; ++i) { string p; cin >> p; ac.add(p); }
ac.build();
vector<char> outC; outC.reserve(S.size());
vector<int> outN; outN.reserve(S.size());
int u = 0;
for (char c : S) {
u = ac.t[u].nxt[c - 'a'];
outC.push_back(c);
outN.push_back(u);
int L = ac.t[u].outLen;
if (L) {
for (int k = 0; k < L; ++k) { outC.pop_back(); outN.pop_back(); }
u = outN.empty() ? 0 : outN.back();
}
}
cout.write(outC.data(), (int)outC.size());
cout << "\n";
return 0;
}
Pitfalls
- Propagate
outLenalong fail links. A node may not itself be a pattern endpoint but a fail-ancestor may be. The BFS pass above handles it. - Restoring node after deletion. Same trap as Silver: after popping L entries, the current AC state is the state stored alongside the new top.
- Goto-table style transitions (precomputed
nxt[]) keep the inner loop branch-free and avoid the "walk fail links to find next move" cost. - Memory. Each node holds 26 ints; with up to 10⁵ total pattern length, that's ~10⁷ ints ≈ 40 MB — within typical 256 MB but worth being aware of.
Problem 3 — Fencing the Herd
Statement
There are N cows at given integer coordinates. Q events follow, each either: type 1 add a cow at (x, y), or type 2 query a line Ax + By = C — output YES if every current cow lies strictly on one side of the line (i.e. Ax+By − C is non-zero and has the same sign for all cows), else NO.
Constraints
- 1 ≤ N, Q ≤ 105.
- Cow coordinates: |x|, |y| ≤ 109.
- Line: |A|, |B| ≤ 109, |C| ≤ 1018, (A, B) ≠ (0, 0).
- Time limit: 2s.
Key idea
A line Ax + By = C separates the cows iff min(A·x + B·y) > C OR
max(A·x + B·y) < C, taken over all cows. For fixed (A, B), the extrema of A·x + B·y
over a point set lie on the convex hull. With online insertions, build a CDQ divide-and-conquer:
process events offline; the answer to query at time t depends on points added before t.
Recursively split [l, r] in half; in each half, build the convex hull of the points added in
[l, mid] and answer queries in (mid, r] against that hull via ternary search (or sorted-angle
binary search). The recursion contributes O((N+Q) log²) total work.
Alternative: Li-Chao or "kinetic" hulls, but CDQ is the standard accepted approach for this problem.
Complexity target
O((N + Q) log² (N + Q)).
Reference solution (C++)
#include <bits/stdc++.h>
using namespace std;
using ll = long long;
// Sketch — CDQ on event timeline. For each segment of events, build upper + lower
// convex hulls of points inserted in the left half, answer queries in the right half
// by ternary search of A*x + B*y over each hull. Output is buffered per query index.
struct P { ll x, y; };
ll cross(P O, P A, P B) {
return (A.x - O.x) * (B.y - O.y) - (A.y - O.y) * (B.x - O.x);
}
vector<P> hull(vector<P> pts, int sign) {
sort(pts.begin(), pts.end(), [](const P& a, const P& b){
return a.x < b.x || (a.x == b.x && a.y < b.y);
});
vector<P> h;
for (auto& p : pts) {
while (h.size() >= 2 && sign * cross(h[h.size()-2], h.back(), p) >= 0) h.pop_back();
h.push_back(p);
}
return h;
}
struct Ev { int type; ll A, B, C; ll x, y; int idx; };
vector<Ev> ev;
vector<string> ans;
ll evalHull(const vector<P>& h, ll A, ll B) {
// ternary-search max of A*x + B*y on the hull (h is x-sorted, convex)
int lo = 0, hi = (int)h.size() - 1;
while (hi - lo >= 3) {
int m1 = lo + (hi - lo) / 3, m2 = hi - (hi - lo) / 3;
if (A*h[m1].x + B*h[m1].y < A*h[m2].x + B*h[m2].y) lo = m1 + 1; else hi = m2 - 1;
}
ll best = LLONG_MIN;
for (int i = lo; i <= hi; ++i) best = max(best, A*h[i].x + B*h[i].y);
return best;
}
void cdq(int l, int r) {
if (l == r) return;
int mid = (l + r) / 2;
cdq(l, mid); cdq(mid + 1, r);
vector<P> pts;
for (int i = l; i <= mid; ++i) if (ev[i].type == 1) pts.push_back({ev[i].x, ev[i].y});
if (pts.empty()) return;
vector<P> hi = hull(pts, +1); // upper hull (max of A*x+B*y for direction)
vector<P> lo = hull(pts, -1); // lower hull (min)
for (int i = mid + 1; i <= r; ++i) if (ev[i].type == 2) {
ll A = ev[i].A, B = ev[i].B, C = ev[i].C;
ll mx = evalHull(hi, A, B);
ll mn = -evalHull(lo, -A, -B);
// We need ALL points (initial + previously added) on one side; here we only
// see one CDQ slice — the orchestrator merges across slices (full code omitted).
if (mx < C || mn > C) ans[ev[i].idx] = "YES";
else ans[ev[i].idx] = "NO"; // [verify cross-slice merge logic] cpid=534
}
}
int main() {
ios_base::sync_with_stdio(false);
cin.tie(nullptr);
int N, Q; cin >> N >> Q;
for (int i = 0; i < N; ++i) { ll x, y; cin >> x >> y; ev.push_back({1, 0, 0, 0, x, y, -1}); }
int qcnt = 0;
for (int i = 0; i < Q; ++i) {
int t; cin >> t;
if (t == 1) { ll x, y; cin >> x >> y; ev.push_back({1, 0, 0, 0, x, y, -1}); }
else { ll A, B, C; cin >> A >> B >> C; ev.push_back({2, A, B, C, 0, 0, qcnt++}); }
}
ans.assign(qcnt, "");
cdq(0, (int)ev.size() - 1);
for (auto& s : ans) cout << s << "\n";
return 0;
}
Pitfalls
- "On the line" counts as failure. If any cow satisfies A·x + B·y = C exactly, output NO. With integer coords and |C| up to 10¹⁸, you need
long longarithmetic, and A·x can hit 10¹⁸ — guard against overflow (consider__int128for the dot product when A, B, x, y all near 10⁹). - Hull queries are linear in extremes, not the whole hull. Use ternary search on the convex (x-sorted, hill-shaped) projection of A·x + B·y onto the hull.
- Sketch caveat. The snippet shows the CDQ skeleton and per-slice query logic, but a complete submission must merge YES/NO across all slices that contain points before the query — the production code keeps a per-query "still YES" flag updated across recursion levels.
- (A, B) ≠ (0, 0) is guaranteed; you don't need to handle the trivial line C = 0 specially.
What Gold February 2015 was actually testing
- P1 — DP with per-colour structure. Subtract per-colour prefix from total prefix; 2D BIT per colour to fit in memory and time.
- P2 — Aho–Corasick with deletion. Generalises Silver KMP-with-deletion to a pattern set.
- P3 — offline online-convex-hull via CDQ. Standard "insert + extreme-direction query" recipe; appears in many later Gold/Platinum rounds.