Example
sph_dam_break_2d

Grid-based SPH dam-break with parallel particle passes.
// ============================================================================
// 2D SPH Dam-Break Simulation (Optimized)
//
// Same physics model as sph_dam_break_2d.tgf:
//   - Poly6 density kernel
//   - Spiky gradient for pressure
//   - Spiky viscosity Laplacian
//   - Equation of state: p = k * (rho - rho0)
//   - Explicit Euler integration
//   - Inelastic reflective boundary conditions
//
// Main optimization:
//   - compact per-cell bins instead of linked lists
//   - particles reordered by cell every step for better locality
// ============================================================================

function clamp(real64 x, real64 lo, real64 hi) -> real64 {
    if (x < lo) {
        return lo
    }
    if (x > hi) {
        return hi
    }
    return x
}

program main {
    use array: full_int64

    string out_dir = "sph_dam_break_2d_optimized_parallel_output"
    ensure_dir(out_dir)

    // Same baseline parameters as the original example.
    int64 n = 1200
    real64 h = 5.0
    real64 edge = 5.0
    real64 width = 400.0
    real64 height = 400.0
    real64 m = 1.0
    real64 rho0 = 1.0
    real64 c0 = 10.0
    real64 gamma = 7.0
    real64 dt = 0.05
    real64 k = 20.0
    real64 g = 0.2
    int64 max_steps = 10000
    int64 output_every = 50
    int64 progress_every = 100
    int64 quiet_steps = 0
    int64 quiet_steps_needed = 200
    real64 visc = 0.5
    real64 eps = 0.1
    real64 bdry_reflec = 0.5
    real64 bdry_proj_dist = 0.0
    real64 steady_ke_per_particle_tol = 0.0002

    real64 x_min = edge
    real64 y_min = edge
    real64 x_max = width - edge
    real64 y_max = height - edge

    int64 init_nx = 20
    int64 init_ny = 60

    int64 grid_nx = int64((x_max - x_min) / h) + 1
    int64 grid_ny = int64((y_max - y_min) / h) + 1
    int64 num_cells = grid_nx * grid_ny

    // Fluid state.
    real64 x[n], y[n], u[n], v[n]
    real64 accel_x[n], accel_y[n], rho[n], p[n]

    // Cell binning and reorder scratch.
    int64 particle_cell[n], particle_order[n]
    int64 cell_counts[num_cells], cell_starts[num_cells], cell_offsets[num_cells]

    real64 x_tmp[n], y_tmp[n], u_tmp[n], v_tmp[n]
    real64 accel_x_tmp[n], accel_y_tmp[n], rho_tmp[n], p_tmp[n]

    // Indices and scratch variables.
    int64 step = 0, i = 0, ix = 0, iy = 0
    int64 cell = 0, cx = 0, cy = 0, ncx = 0, ncy = 0
    int64 sx0 = 0, sx1 = 0, sy0 = 0, sy1 = 0
    int64 start = 0, count = 0, pos = 0, j = 0, running = 0

    real64 dx = 0.0, dy = 0.0, rij = 0.0, r2 = 0.0, t = 0.0
    real64 grad_fac = 0.0, lapv = 0.0
    real64 grad_px = 0.0, grad_py = 0.0
    real64 visc_x = 0.0, visc_y = 0.0
    real64 rho_i = 0.0, avg_rho = 0.0, max_speed = 0.0, speed = 0.0
    real64 kinetic_energy = 0.0, ke_per_particle = 0.0
    real64 bx0 = 0.0, bx1 = 0.0, by0 = 0.0, by1 = 0.0
    real64 jitter_x = 0.0, jitter_y = 0.0
    real64 h2 = 0.0, poly6_coeff = 0.0, spiky_grad_coeff = 0.0, visc_lap_coeff = 0.0

    string csv = "", frame_path = ""

    h2 = h * h
    poly6_coeff = 315.0 / (64.0 * 3.141592653589793 * h * h * h * h * h * h * h * h * h)
    spiky_grad_coeff = -45.0 / (3.141592653589793 * h * h * h * h * h * h)
    visc_lap_coeff = 45.0 / (3.141592653589793 * h * h * h * h * h * h)

    // Initial left-column dam-break pack with random jitter.
    do iy = 0, init_ny - 1 {
        do ix = 0, init_nx - 1 {
            i = ix + init_nx * iy
            if (i < n) {
                jitter_x = rand()
                jitter_y = rand()
                x[i] = x_min + h * real64(ix) + h * eps * jitter_x
                y[i] = y_min + h * real64(iy) + h * eps * jitter_y
                u[i] = 0.0
                v[i] = 0.0
                accel_x[i] = 0.0
                accel_y[i] = 0.0
                rho[i] = rho0
                p[i] = 0.0
            }
        }
    }

    string meta = "max_steps=" + to_string(max_steps) + "\n"
    meta += "output_every=" + to_string(output_every) + "\n"
    meta += "n=" + to_string(n) + "\n"
    meta += "h=" + to_string(h) + "\n"
    meta += "dt=" + to_string(dt) + "\n"
    meta += "steady_ke_per_particle_tol=" + to_string(steady_ke_per_particle_tol) + "\n"
    meta += "quiet_steps_needed=" + to_string(quiet_steps_needed) + "\n"
    meta += "x_min=" + to_string(x_min) + "\n"
    meta += "x_max=" + to_string(x_max) + "\n"
    meta += "y_min=" + to_string(y_min) + "\n"
    meta += "y_max=" + to_string(y_max) + "\n"
    write_file(out_dir + "/meta.txt", meta)

    print("SPH 2D dam break optimized: n=", n, "h=", h, "dt=", dt, "steady_ke=", steady_ke_per_particle_tol)

    step = 0
    while (step <= max_steps && quiet_steps < quiet_steps_needed) {
        // Bin particles into compact contiguous cell spans.
        cell_counts = full_int64(num_cells, 0)
        do i = 0, n - 1 {
            cx = int64(clamp((x[i] - x_min) / h, 0.0, real64(grid_nx - 1)))
            cy = int64(clamp((y[i] - y_min) / h, 0.0, real64(grid_ny - 1)))
            cell = cx + cy * grid_nx
            particle_cell[i] = cell
            cell_counts[cell] += 1
        }

        running = 0
        do cell = 0, num_cells - 1 {
            cell_starts[cell] = running
            cell_offsets[cell] = running
            running += cell_counts[cell]
        }

        do i = 0, n - 1 {
            cell = particle_cell[i]
            particle_order[cell_offsets[cell]] = i
            cell_offsets[cell] += 1
        }

        // Reorder by cell so neighbor cell spans are contiguous in memory.
        do pos = 0, n - 1 {
            i = particle_order[pos]
            x_tmp[pos] = x[i]
            y_tmp[pos] = y[i]
            u_tmp[pos] = u[i]
            v_tmp[pos] = v[i]
            accel_x_tmp[pos] = accel_x[i]
            accel_y_tmp[pos] = accel_y[i]
            rho_tmp[pos] = rho[i]
            p_tmp[pos] = p[i]
        }
        x = x_tmp
        y = y_tmp
        u = u_tmp
        v = v_tmp
        accel_x = accel_x_tmp
        accel_y = accel_y_tmp
        rho = rho_tmp
        p = p_tmp

        // Density
        do parallel i = 0, n - 1 {
            real64 rho_local = 0.0
            int64 cx_local = int64(clamp((x[i] - x_min) / h, 0.0, real64(grid_nx - 1)))
            int64 cy_local = int64(clamp((y[i] - y_min) / h, 0.0, real64(grid_ny - 1)))
            int64 sx0_local = max(0, cx_local - 1)
            int64 sx1_local = min(grid_nx - 1, cx_local + 1)
            int64 sy0_local = max(0, cy_local - 1)
            int64 sy1_local = min(grid_ny - 1, cy_local + 1)
            int64 ncy_local = 0
            int64 ncx_local = 0
            int64 cell_local = 0
            int64 start_local = 0
            int64 count_local = 0
            int64 pos_local = 0
            int64 j_local = 0
            real64 dx_local = 0.0
            real64 dy_local = 0.0
            real64 r2_local = 0.0
            real64 t_local = 0.0
            do ncy_local = sy0_local, sy1_local {
                do ncx_local = sx0_local, sx1_local {
                    cell_local = ncx_local + ncy_local * grid_nx
                    start_local = cell_starts[cell_local]
                    count_local = cell_counts[cell_local]
                    do pos_local = 0, count_local - 1 {
                        j_local = start_local + pos_local
                        dx_local = x[i] - x[j_local]
                        dy_local = y[i] - y[j_local]
                        r2_local = dx_local * dx_local + dy_local * dy_local
                        if (r2_local <= h2) {
                            t_local = h2 - r2_local
                            rho_local += m * poly6_coeff * t_local * t_local * t_local
                        }
                    }
                }
            }
            rho[i] = rho_local
        }

        // Pressure EOS
        do parallel i = 0, n - 1 {
            p[i] = k * (rho[i] - rho0)
        }

        // Acceleration
        do parallel i = 0, n - 1 {
            real64 ax_local_acc = 0.0
            real64 ay_local_acc = -g
            int64 cx_local_acc = int64(clamp((x[i] - x_min) / h, 0.0, real64(grid_nx - 1)))
            int64 cy_local_acc = int64(clamp((y[i] - y_min) / h, 0.0, real64(grid_ny - 1)))
            int64 sx0_local_acc = max(0, cx_local_acc - 1)
            int64 sx1_local_acc = min(grid_nx - 1, cx_local_acc + 1)
            int64 sy0_local_acc = max(0, cy_local_acc - 1)
            int64 sy1_local_acc = min(grid_ny - 1, cy_local_acc + 1)
            int64 ncy_local_acc = 0
            int64 ncx_local_acc = 0
            int64 cell_local_acc = 0
            int64 start_local_acc = 0
            int64 count_local_acc = 0
            int64 pos_local_acc = 0
            int64 j_local_acc = 0
            real64 dx_local_acc = 0.0
            real64 dy_local_acc = 0.0
            real64 rij_local_acc = 0.0
            real64 r2_local_acc = 0.0
            real64 grad_fac_local_acc = 0.0
            real64 lapv_local_acc = 0.0
            real64 grad_px_local_acc = 0.0
            real64 grad_py_local_acc = 0.0
            real64 visc_x_local_acc = 0.0
            real64 visc_y_local_acc = 0.0

            do ncy_local_acc = sy0_local_acc, sy1_local_acc {
                do ncx_local_acc = sx0_local_acc, sx1_local_acc {
                    cell_local_acc = ncx_local_acc + ncy_local_acc * grid_nx
                    start_local_acc = cell_starts[cell_local_acc]
                    count_local_acc = cell_counts[cell_local_acc]
                    do pos_local_acc = 0, count_local_acc - 1 {
                        j_local_acc = start_local_acc + pos_local_acc
                        if (j_local_acc != i) {
                            dx_local_acc = x[i] - x[j_local_acc]
                            dy_local_acc = y[i] - y[j_local_acc]
                            r2_local_acc = dx_local_acc * dx_local_acc + dy_local_acc * dy_local_acc
                            if (r2_local_acc > 0.0 && r2_local_acc <= h2) {
                                rij_local_acc = sqrt(r2_local_acc)
                                grad_fac_local_acc = spiky_grad_coeff * (h - rij_local_acc) * (h - rij_local_acc) / rij_local_acc
                                grad_px_local_acc = m * (p[i] + p[j_local_acc]) * grad_fac_local_acc * dx_local_acc / (2.0 * rho[i] * rho[j_local_acc])
                                grad_py_local_acc = m * (p[i] + p[j_local_acc]) * grad_fac_local_acc * dy_local_acc / (2.0 * rho[i] * rho[j_local_acc])

                                lapv_local_acc = visc_lap_coeff * (h - rij_local_acc)
                                visc_x_local_acc = m * visc * (u[j_local_acc] - u[i]) * lapv_local_acc / (rho[i] * rho[j_local_acc])
                                visc_y_local_acc = m * visc * (v[j_local_acc] - v[i]) * lapv_local_acc / (rho[i] * rho[j_local_acc])

                                ax_local_acc += grad_px_local_acc + visc_x_local_acc
                                ay_local_acc += grad_py_local_acc + visc_y_local_acc
                            }
                        }
                    }
                }
            }
            accel_x[i] = ax_local_acc
            accel_y[i] = ay_local_acc
        }

        // Explicit Euler
        do parallel i = 0, n - 1 {
            u[i] += dt * accel_x[i]
            v[i] += dt * accel_y[i]
            x[i] += dt * u[i]
            y[i] += dt * v[i]
        }

        // Same reflective walls as the original example.
        do parallel i = 0, n - 1 {
            if (x[i] < x_min) {
                u[i] *= -bdry_reflec
                x[i] = x_min + bdry_proj_dist
            } else if (x[i] > x_max) {
                u[i] *= -bdry_reflec
                x[i] = x_max - bdry_proj_dist
            }

            if (y[i] < y_min) {
                v[i] *= -bdry_reflec
                y[i] = y_min + bdry_proj_dist
            } else if (y[i] > y_max) {
                v[i] *= -bdry_reflec
                y[i] = y_max - bdry_proj_dist
            }
        }

        max_speed = 0.0
        kinetic_energy = 0.0
        do i = 0, n - 1 {
            speed = sqrt(u[i] * u[i] + v[i] * v[i])
            max_speed = max(max_speed, speed)
            kinetic_energy += 0.5 * m * (u[i] * u[i] + v[i] * v[i])
        }
        ke_per_particle = kinetic_energy / real64(n)
        if (ke_per_particle < steady_ke_per_particle_tol) {
            quiet_steps += 1
        } else {
            quiet_steps = 0
        }

        if (step % progress_every == 0) {
            avg_rho = 0.0
            do i = 0, n - 1 {
                avg_rho += rho[i]
            }
            avg_rho = avg_rho / real64(n)
            print("step", step, "/", max_steps, "avg_rho", avg_rho, "max_speed", max_speed, "ke_per_particle", ke_per_particle, "quiet", quiet_steps)
        }

        if (step % output_every == 0) {
            csv = "x,y,vx,vy,rho,p\n"
            do i = 0, n - 1 {
                csv += to_string(x[i]) + "," + to_string(y[i]) + ","
                csv += to_string(u[i]) + "," + to_string(v[i]) + ","
                csv += to_string(rho[i]) + "," + to_string(p[i]) + "\n"
            }
            frame_path = out_dir + "/frame_" + to_string(step) + ".csv"
            write_file(frame_path, csv)
        }
        step += 1
    }

    avg_rho = 0.0
    max_speed = 0.0
    bx0 = x[0]
    bx1 = x[0]
    by0 = y[0]
    by1 = y[0]
    do i = 0, n - 1 {
        avg_rho += rho[i]
        speed = sqrt(u[i] * u[i] + v[i] * v[i])
        max_speed = max(max_speed, speed)
        bx0 = min(bx0, x[i])
        bx1 = max(bx1, x[i])
        by0 = min(by0, y[i])
        by1 = max(by1, y[i])
    }
    avg_rho = avg_rho / real64(n)

    string summary = "particles=" + to_string(n) + "\n"
    summary += "executed_steps=" + to_string(step) + "\n"
    summary += "quiet_steps=" + to_string(quiet_steps) + "\n"
    summary += "avg_rho=" + to_string(avg_rho) + "\n"
    summary += "ke_per_particle=" + to_string(ke_per_particle) + "\n"
    summary += "max_speed=" + to_string(max_speed) + "\n"
    summary += "bounds=" + to_string(bx0) + "," + to_string(bx1) + "," + to_string(by0) + "," + to_string(by1) + "\n"
    write_file(out_dir + "/summary.txt", summary)

    print("Done:", n, "particles, avg_rho=", avg_rho, "bounds=", bx0, bx1, by0, by1)
}