Example
sph_dam_break_2d_udt

SPH dam-break organized around UDT-backed simulation state.
// ============================================================================
// 2D SPH Dam-Break Simulation (Parallel, UDT-backed)
//
// The hot loops still operate over arrays, but the simulation is organized
// around UDT-backed parameter and state blocks.
// ============================================================================

udt SimParams {
    int64 n
    real64 h
    real64 edge
    real64 width
    real64 height
    real64 m
    real64 rho0
    real64 c0
    real64 gamma
    real64 dt
    real64 k
    real64 g
    int64 max_steps
    int64 output_every
    int64 progress_every
    int64 quiet_steps_needed
    real64 visc
    real64 eps
    real64 bdry_reflec
    real64 bdry_proj_dist
    real64 steady_ke_per_particle_tol
    real64 x_min
    real64 y_min
    real64 x_max
    real64 y_max
    int64 init_nx
    int64 init_ny
    int64 grid_nx
    int64 grid_ny
    int64 num_cells
    real64 h2
    real64 poly6_coeff
    real64 spiky_grad_coeff
    real64 visc_lap_coeff
}

udt FluidState {
    real64[:] x
    real64[:] y
    real64[:] u
    real64[:] v
    real64[:] accel_x
    real64[:] accel_y
    real64[:] rho
    real64[:] p
}

udt GridScratch {
    int64[:] particle_cell
    int64[:] particle_order
    int64[:] cell_counts
    int64[:] cell_starts
    int64[:] cell_offsets
    real64[:] x_tmp
    real64[:] y_tmp
    real64[:] u_tmp
    real64[:] v_tmp
    real64[:] accel_x_tmp
    real64[:] accel_y_tmp
    real64[:] rho_tmp
    real64[:] p_tmp
}

function clamp(real64 x, real64 lo, real64 hi) -> real64 {
    if (x < lo) {
        return lo
    }
    if (x > hi) {
        return hi
    }
    return x
}

program main {
    use array: full_int64, full_real64

    string out_dir = "sph_dam_break_2d_udt_parallel_output"
    ensure_dir(out_dir)

    SimParams sim
    sim.n = 1200
    sim.h = 5.0
    sim.edge = 5.0
    sim.width = 400.0
    sim.height = 400.0
    sim.m = 1.0
    sim.rho0 = 1.0
    sim.c0 = 10.0
    sim.gamma = 7.0
    sim.dt = 0.05
    sim.k = 20.0
    sim.g = 0.2
    sim.max_steps = 10000
    sim.output_every = 50
    sim.progress_every = 100
    sim.quiet_steps_needed = 200
    sim.visc = 0.5
    sim.eps = 0.1
    sim.bdry_reflec = 0.5
    sim.bdry_proj_dist = 0.0
    sim.steady_ke_per_particle_tol = 0.0002
    sim.x_min = sim.edge
    sim.y_min = sim.edge
    sim.x_max = sim.width - sim.edge
    sim.y_max = sim.height - sim.edge
    sim.init_nx = 20
    sim.init_ny = 60
    sim.grid_nx = int64((sim.x_max - sim.x_min) / sim.h) + 1
    sim.grid_ny = int64((sim.y_max - sim.y_min) / sim.h) + 1
    sim.num_cells = sim.grid_nx * sim.grid_ny
    sim.h2 = sim.h * sim.h
    sim.poly6_coeff = 315.0 / (64.0 * 3.141592653589793 * sim.h * sim.h * sim.h * sim.h * sim.h * sim.h * sim.h * sim.h * sim.h)
    sim.spiky_grad_coeff = -45.0 / (3.141592653589793 * sim.h * sim.h * sim.h * sim.h * sim.h * sim.h)
    sim.visc_lap_coeff = 45.0 / (3.141592653589793 * sim.h * sim.h * sim.h * sim.h * sim.h * sim.h)

    FluidState state
    state.x = full_real64(sim.n, 0.0)
    state.y = full_real64(sim.n, 0.0)
    state.u = full_real64(sim.n, 0.0)
    state.v = full_real64(sim.n, 0.0)
    state.accel_x = full_real64(sim.n, 0.0)
    state.accel_y = full_real64(sim.n, 0.0)
    state.rho = full_real64(sim.n, sim.rho0)
    state.p = full_real64(sim.n, 0.0)

    GridScratch grid
    grid.particle_cell = full_int64(sim.n, 0)
    grid.particle_order = full_int64(sim.n, 0)
    grid.cell_counts = full_int64(sim.num_cells, 0)
    grid.cell_starts = full_int64(sim.num_cells, 0)
    grid.cell_offsets = full_int64(sim.num_cells, 0)
    grid.x_tmp = full_real64(sim.n, 0.0)
    grid.y_tmp = full_real64(sim.n, 0.0)
    grid.u_tmp = full_real64(sim.n, 0.0)
    grid.v_tmp = full_real64(sim.n, 0.0)
    grid.accel_x_tmp = full_real64(sim.n, 0.0)
    grid.accel_y_tmp = full_real64(sim.n, 0.0)
    grid.rho_tmp = full_real64(sim.n, sim.rho0)
    grid.p_tmp = full_real64(sim.n, 0.0)

    int64 quiet_steps = 0
    int64 step = 0
    int64 i = 0, ix = 0, iy = 0
    int64 cell = 0, cx = 0, cy = 0, ncx = 0, ncy = 0
    int64 sx0 = 0, sx1 = 0, sy0 = 0, sy1 = 0
    int64 start = 0, count = 0, pos = 0, j = 0, running = 0

    real64 dx = 0.0, dy = 0.0, rij = 0.0, r2 = 0.0, t = 0.0
    real64 grad_fac = 0.0, lapv = 0.0
    real64 grad_px = 0.0, grad_py = 0.0
    real64 visc_x = 0.0, visc_y = 0.0
    real64 rho_i = 0.0, avg_rho = 0.0, max_speed = 0.0, speed = 0.0
    real64 kinetic_energy = 0.0, ke_per_particle = 0.0
    real64 bx0 = 0.0, bx1 = 0.0, by0 = 0.0, by1 = 0.0
    real64 jitter_x = 0.0, jitter_y = 0.0

    string csv = ""
    string frame_path = ""

    do iy = 0, sim.init_ny - 1 {
        do ix = 0, sim.init_nx - 1 {
            i = ix + sim.init_nx * iy
            if (i < sim.n) {
                jitter_x = rand()
                jitter_y = rand()
                state.x[i] = sim.x_min + sim.h * real64(ix) + sim.h * sim.eps * jitter_x
                state.y[i] = sim.y_min + sim.h * real64(iy) + sim.h * sim.eps * jitter_y
                state.u[i] = 0.0
                state.v[i] = 0.0
                state.accel_x[i] = 0.0
                state.accel_y[i] = 0.0
                state.rho[i] = sim.rho0
                state.p[i] = 0.0
            }
        }
    }

    string meta = "max_steps=" + to_string(sim.max_steps) + "\n"
    meta += "output_every=" + to_string(sim.output_every) + "\n"
    meta += "n=" + to_string(sim.n) + "\n"
    meta += "h=" + to_string(sim.h) + "\n"
    meta += "dt=" + to_string(sim.dt) + "\n"
    meta += "steady_ke_per_particle_tol=" + to_string(sim.steady_ke_per_particle_tol) + "\n"
    meta += "quiet_steps_needed=" + to_string(sim.quiet_steps_needed) + "\n"
    meta += "x_min=" + to_string(sim.x_min) + "\n"
    meta += "x_max=" + to_string(sim.x_max) + "\n"
    meta += "y_min=" + to_string(sim.y_min) + "\n"
    meta += "y_max=" + to_string(sim.y_max) + "\n"
    write_file(out_dir + "/meta.txt", meta)

    print("SPH 2D dam break UDT parallel: n=", sim.n, "h=", sim.h, "dt=", sim.dt, "steady_ke=", sim.steady_ke_per_particle_tol)

    step = 0
    while (step <= sim.max_steps && quiet_steps < sim.quiet_steps_needed) {
        grid.cell_counts = full_int64(sim.num_cells, 0)
        do i = 0, sim.n - 1 {
            cx = int64(clamp((state.x[i] - sim.x_min) / sim.h, 0.0, real64(sim.grid_nx - 1)))
            cy = int64(clamp((state.y[i] - sim.y_min) / sim.h, 0.0, real64(sim.grid_ny - 1)))
            cell = cx + cy * sim.grid_nx
            grid.particle_cell[i] = cell
            grid.cell_counts[cell] += 1
        }

        running = 0
        do cell = 0, sim.num_cells - 1 {
            grid.cell_starts[cell] = running
            grid.cell_offsets[cell] = running
            running += grid.cell_counts[cell]
        }

        do i = 0, sim.n - 1 {
            cell = grid.particle_cell[i]
            grid.particle_order[grid.cell_offsets[cell]] = i
            grid.cell_offsets[cell] += 1
        }

        do pos = 0, sim.n - 1 {
            i = grid.particle_order[pos]
            grid.x_tmp[pos] = state.x[i]
            grid.y_tmp[pos] = state.y[i]
            grid.u_tmp[pos] = state.u[i]
            grid.v_tmp[pos] = state.v[i]
            grid.accel_x_tmp[pos] = state.accel_x[i]
            grid.accel_y_tmp[pos] = state.accel_y[i]
            grid.rho_tmp[pos] = state.rho[i]
            grid.p_tmp[pos] = state.p[i]
        }
        state.x = grid.x_tmp
        state.y = grid.y_tmp
        state.u = grid.u_tmp
        state.v = grid.v_tmp
        state.accel_x = grid.accel_x_tmp
        state.accel_y = grid.accel_y_tmp
        state.rho = grid.rho_tmp
        state.p = grid.p_tmp

        do parallel i = 0, sim.n - 1 {
            real64 rho_local = 0.0
            int64 cx_local = int64(clamp((state.x[i] - sim.x_min) / sim.h, 0.0, real64(sim.grid_nx - 1)))
            int64 cy_local = int64(clamp((state.y[i] - sim.y_min) / sim.h, 0.0, real64(sim.grid_ny - 1)))
            int64 sx0_local = max(0, cx_local - 1)
            int64 sx1_local = min(sim.grid_nx - 1, cx_local + 1)
            int64 sy0_local = max(0, cy_local - 1)
            int64 sy1_local = min(sim.grid_ny - 1, cy_local + 1)
            int64 ncy_local = 0
            int64 ncx_local = 0
            int64 cell_local = 0
            int64 start_local = 0
            int64 count_local = 0
            int64 pos_local = 0
            int64 j_local = 0
            real64 dx_local = 0.0
            real64 dy_local = 0.0
            real64 r2_local = 0.0
            real64 t_local = 0.0
            do ncy_local = sy0_local, sy1_local {
                do ncx_local = sx0_local, sx1_local {
                    cell_local = ncx_local + ncy_local * sim.grid_nx
                    start_local = grid.cell_starts[cell_local]
                    count_local = grid.cell_counts[cell_local]
                    do pos_local = 0, count_local - 1 {
                        j_local = start_local + pos_local
                        dx_local = state.x[i] - state.x[j_local]
                        dy_local = state.y[i] - state.y[j_local]
                        r2_local = dx_local * dx_local + dy_local * dy_local
                        if (r2_local <= sim.h2) {
                            t_local = sim.h2 - r2_local
                            rho_local += sim.m * sim.poly6_coeff * t_local * t_local * t_local
                        }
                    }
                }
            }
            state.rho[i] = rho_local
        }

        do parallel i = 0, sim.n - 1 {
            state.p[i] = sim.k * (state.rho[i] - sim.rho0)
        }

        do parallel i = 0, sim.n - 1 {
            real64 ax_local_acc = 0.0
            real64 ay_local_acc = -sim.g
            int64 cx_local_acc = int64(clamp((state.x[i] - sim.x_min) / sim.h, 0.0, real64(sim.grid_nx - 1)))
            int64 cy_local_acc = int64(clamp((state.y[i] - sim.y_min) / sim.h, 0.0, real64(sim.grid_ny - 1)))
            int64 sx0_local_acc = max(0, cx_local_acc - 1)
            int64 sx1_local_acc = min(sim.grid_nx - 1, cx_local_acc + 1)
            int64 sy0_local_acc = max(0, cy_local_acc - 1)
            int64 sy1_local_acc = min(sim.grid_ny - 1, cy_local_acc + 1)
            int64 ncy_local_acc = 0
            int64 ncx_local_acc = 0
            int64 cell_local_acc = 0
            int64 start_local_acc = 0
            int64 count_local_acc = 0
            int64 pos_local_acc = 0
            int64 j_local_acc = 0
            real64 dx_local_acc = 0.0
            real64 dy_local_acc = 0.0
            real64 rij_local_acc = 0.0
            real64 r2_local_acc = 0.0
            real64 grad_fac_local_acc = 0.0
            real64 lapv_local_acc = 0.0
            real64 grad_px_local_acc = 0.0
            real64 grad_py_local_acc = 0.0
            real64 visc_x_local_acc = 0.0
            real64 visc_y_local_acc = 0.0

            do ncy_local_acc = sy0_local_acc, sy1_local_acc {
                do ncx_local_acc = sx0_local_acc, sx1_local_acc {
                    cell_local_acc = ncx_local_acc + ncy_local_acc * sim.grid_nx
                    start_local_acc = grid.cell_starts[cell_local_acc]
                    count_local_acc = grid.cell_counts[cell_local_acc]
                    do pos_local_acc = 0, count_local_acc - 1 {
                        j_local_acc = start_local_acc + pos_local_acc
                        if (j_local_acc != i) {
                            dx_local_acc = state.x[i] - state.x[j_local_acc]
                            dy_local_acc = state.y[i] - state.y[j_local_acc]
                            r2_local_acc = dx_local_acc * dx_local_acc + dy_local_acc * dy_local_acc
                            if (r2_local_acc > 0.0 && r2_local_acc <= sim.h2) {
                                rij_local_acc = sqrt(r2_local_acc)
                                grad_fac_local_acc = sim.spiky_grad_coeff * (sim.h - rij_local_acc) * (sim.h - rij_local_acc) / rij_local_acc
                                grad_px_local_acc = sim.m * (state.p[i] + state.p[j_local_acc]) * grad_fac_local_acc * dx_local_acc / (2.0 * state.rho[i] * state.rho[j_local_acc])
                                grad_py_local_acc = sim.m * (state.p[i] + state.p[j_local_acc]) * grad_fac_local_acc * dy_local_acc / (2.0 * state.rho[i] * state.rho[j_local_acc])
                                lapv_local_acc = sim.visc_lap_coeff * (sim.h - rij_local_acc)
                                visc_x_local_acc = sim.m * sim.visc * (state.u[j_local_acc] - state.u[i]) * lapv_local_acc / (state.rho[i] * state.rho[j_local_acc])
                                visc_y_local_acc = sim.m * sim.visc * (state.v[j_local_acc] - state.v[i]) * lapv_local_acc / (state.rho[i] * state.rho[j_local_acc])
                                ax_local_acc += grad_px_local_acc + visc_x_local_acc
                                ay_local_acc += grad_py_local_acc + visc_y_local_acc
                            }
                        }
                    }
                }
            }
            state.accel_x[i] = ax_local_acc
            state.accel_y[i] = ay_local_acc
        }

        do parallel i = 0, sim.n - 1 {
            state.u[i] += sim.dt * state.accel_x[i]
            state.v[i] += sim.dt * state.accel_y[i]
            state.x[i] += sim.dt * state.u[i]
            state.y[i] += sim.dt * state.v[i]
        }

        do parallel i = 0, sim.n - 1 {
            if (state.x[i] < sim.x_min + sim.bdry_proj_dist) {
                state.x[i] = sim.x_min + sim.bdry_proj_dist
                state.u[i] = -sim.bdry_reflec * state.u[i]
            }
            if (state.x[i] > sim.x_max - sim.bdry_proj_dist) {
                state.x[i] = sim.x_max - sim.bdry_proj_dist
                state.u[i] = -sim.bdry_reflec * state.u[i]
            }
            if (state.y[i] < sim.y_min + sim.bdry_proj_dist) {
                state.y[i] = sim.y_min + sim.bdry_proj_dist
                state.v[i] = -sim.bdry_reflec * state.v[i]
            }
            if (state.y[i] > sim.y_max - sim.bdry_proj_dist) {
                state.y[i] = sim.y_max - sim.bdry_proj_dist
                state.v[i] = -sim.bdry_reflec * state.v[i]
            }
        }

        avg_rho = 0.0
        max_speed = 0.0
        kinetic_energy = 0.0
        bx0 = state.x[0]
        bx1 = state.x[0]
        by0 = state.y[0]
        by1 = state.y[0]
        do i = 0, sim.n - 1 {
            avg_rho += state.rho[i]
            speed = sqrt(state.u[i] * state.u[i] + state.v[i] * state.v[i])
            if (speed > max_speed) {
                max_speed = speed
            }
            kinetic_energy += 0.5 * sim.m * (state.u[i] * state.u[i] + state.v[i] * state.v[i])
            if (state.x[i] < bx0) {
                bx0 = state.x[i]
            }
            if (state.x[i] > bx1) {
                bx1 = state.x[i]
            }
            if (state.y[i] < by0) {
                by0 = state.y[i]
            }
            if (state.y[i] > by1) {
                by1 = state.y[i]
            }
        }
        avg_rho /= real64(sim.n)
        ke_per_particle = kinetic_energy / real64(sim.n)

        if (ke_per_particle < sim.steady_ke_per_particle_tol) {
            quiet_steps += 1
        } else {
            quiet_steps = 0
        }

        if (step % sim.progress_every == 0) {
            print("step", step, "/", sim.max_steps, "avg_rho", avg_rho, "max_speed", max_speed, "ke_per_particle", ke_per_particle, "quiet", quiet_steps)
        }

        if (step % sim.output_every == 0) {
            csv = "x,y,u,v,rho,p\n"
            do i = 0, sim.n - 1 {
                csv += to_string(state.x[i]) + "," + to_string(state.y[i]) + "," + to_string(state.u[i]) + "," + to_string(state.v[i]) + "," + to_string(state.rho[i]) + "," + to_string(state.p[i]) + "\n"
            }
            frame_path = out_dir + "/frame_" + to_string(step) + ".csv"
            write_file(frame_path, csv)
        }

        step += 1
    }

    string summary = "executed_steps=" + to_string(step) + "\n"
    summary += "quiet_steps=" + to_string(quiet_steps) + "\n"
    summary += "avg_rho=" + to_string(avg_rho) + "\n"
    summary += "max_speed=" + to_string(max_speed) + "\n"
    summary += "ke_per_particle=" + to_string(ke_per_particle) + "\n"
    summary += "min_x=" + to_string(bx0) + "\n"
    summary += "max_x=" + to_string(bx1) + "\n"
    summary += "min_y=" + to_string(by0) + "\n"
    summary += "max_y=" + to_string(by1) + "\n"
    write_file(out_dir + "/summary.txt", summary)

    print("Done:", sim.n, "particles, avg_rho=", avg_rho, "bounds=", bx0, bx1, by0, by1)
}