Example
sph_dam_break_2d
Grid-based SPH dam-break with parallel particle passes.
// ============================================================================
// 2D SPH Dam-Break Simulation (Optimized)
//
// Same physics model as sph_dam_break_2d.tgf:
// - Poly6 density kernel
// - Spiky gradient for pressure
// - Spiky viscosity Laplacian
// - Equation of state: p = k * (rho - rho0)
// - Explicit Euler integration
// - Inelastic reflective boundary conditions
//
// Main optimization:
// - compact per-cell bins instead of linked lists
// - particles reordered by cell every step for better locality
// ============================================================================
function clamp(real64 x, real64 lo, real64 hi) -> real64 {
if (x < lo) {
return lo
}
if (x > hi) {
return hi
}
return x
}
program main {
use array: full_int64
string out_dir = "sph_dam_break_2d_optimized_parallel_output"
ensure_dir(out_dir)
// Same baseline parameters as the original example.
int64 n = 1200
real64 h = 5.0
real64 edge = 5.0
real64 width = 400.0
real64 height = 400.0
real64 m = 1.0
real64 rho0 = 1.0
real64 c0 = 10.0
real64 gamma = 7.0
real64 dt = 0.05
real64 k = 20.0
real64 g = 0.2
int64 max_steps = 10000
int64 output_every = 50
int64 progress_every = 100
int64 quiet_steps = 0
int64 quiet_steps_needed = 200
real64 visc = 0.5
real64 eps = 0.1
real64 bdry_reflec = 0.5
real64 bdry_proj_dist = 0.0
real64 steady_ke_per_particle_tol = 0.0002
real64 x_min = edge
real64 y_min = edge
real64 x_max = width - edge
real64 y_max = height - edge
int64 init_nx = 20
int64 init_ny = 60
int64 grid_nx = int64((x_max - x_min) / h) + 1
int64 grid_ny = int64((y_max - y_min) / h) + 1
int64 num_cells = grid_nx * grid_ny
// Fluid state.
real64 x[n], y[n], u[n], v[n]
real64 accel_x[n], accel_y[n], rho[n], p[n]
// Cell binning and reorder scratch.
int64 particle_cell[n], particle_order[n]
int64 cell_counts[num_cells], cell_starts[num_cells], cell_offsets[num_cells]
real64 x_tmp[n], y_tmp[n], u_tmp[n], v_tmp[n]
real64 accel_x_tmp[n], accel_y_tmp[n], rho_tmp[n], p_tmp[n]
// Indices and scratch variables.
int64 step = 0, i = 0, ix = 0, iy = 0
int64 cell = 0, cx = 0, cy = 0, ncx = 0, ncy = 0
int64 sx0 = 0, sx1 = 0, sy0 = 0, sy1 = 0
int64 start = 0, count = 0, pos = 0, j = 0, running = 0
real64 dx = 0.0, dy = 0.0, rij = 0.0, r2 = 0.0, t = 0.0
real64 grad_fac = 0.0, lapv = 0.0
real64 grad_px = 0.0, grad_py = 0.0
real64 visc_x = 0.0, visc_y = 0.0
real64 rho_i = 0.0, avg_rho = 0.0, max_speed = 0.0, speed = 0.0
real64 kinetic_energy = 0.0, ke_per_particle = 0.0
real64 bx0 = 0.0, bx1 = 0.0, by0 = 0.0, by1 = 0.0
real64 jitter_x = 0.0, jitter_y = 0.0
real64 h2 = 0.0, poly6_coeff = 0.0, spiky_grad_coeff = 0.0, visc_lap_coeff = 0.0
string csv = "", frame_path = ""
h2 = h * h
poly6_coeff = 315.0 / (64.0 * 3.141592653589793 * h * h * h * h * h * h * h * h * h)
spiky_grad_coeff = -45.0 / (3.141592653589793 * h * h * h * h * h * h)
visc_lap_coeff = 45.0 / (3.141592653589793 * h * h * h * h * h * h)
// Initial left-column dam-break pack with random jitter.
do iy = 0, init_ny - 1 {
do ix = 0, init_nx - 1 {
i = ix + init_nx * iy
if (i < n) {
jitter_x = rand()
jitter_y = rand()
x[i] = x_min + h * real64(ix) + h * eps * jitter_x
y[i] = y_min + h * real64(iy) + h * eps * jitter_y
u[i] = 0.0
v[i] = 0.0
accel_x[i] = 0.0
accel_y[i] = 0.0
rho[i] = rho0
p[i] = 0.0
}
}
}
string meta = "max_steps=" + to_string(max_steps) + "\n"
meta += "output_every=" + to_string(output_every) + "\n"
meta += "n=" + to_string(n) + "\n"
meta += "h=" + to_string(h) + "\n"
meta += "dt=" + to_string(dt) + "\n"
meta += "steady_ke_per_particle_tol=" + to_string(steady_ke_per_particle_tol) + "\n"
meta += "quiet_steps_needed=" + to_string(quiet_steps_needed) + "\n"
meta += "x_min=" + to_string(x_min) + "\n"
meta += "x_max=" + to_string(x_max) + "\n"
meta += "y_min=" + to_string(y_min) + "\n"
meta += "y_max=" + to_string(y_max) + "\n"
write_file(out_dir + "/meta.txt", meta)
print("SPH 2D dam break optimized: n=", n, "h=", h, "dt=", dt, "steady_ke=", steady_ke_per_particle_tol)
step = 0
while (step <= max_steps && quiet_steps < quiet_steps_needed) {
// Bin particles into compact contiguous cell spans.
cell_counts = full_int64(num_cells, 0)
do i = 0, n - 1 {
cx = int64(clamp((x[i] - x_min) / h, 0.0, real64(grid_nx - 1)))
cy = int64(clamp((y[i] - y_min) / h, 0.0, real64(grid_ny - 1)))
cell = cx + cy * grid_nx
particle_cell[i] = cell
cell_counts[cell] += 1
}
running = 0
do cell = 0, num_cells - 1 {
cell_starts[cell] = running
cell_offsets[cell] = running
running += cell_counts[cell]
}
do i = 0, n - 1 {
cell = particle_cell[i]
particle_order[cell_offsets[cell]] = i
cell_offsets[cell] += 1
}
// Reorder by cell so neighbor cell spans are contiguous in memory.
do pos = 0, n - 1 {
i = particle_order[pos]
x_tmp[pos] = x[i]
y_tmp[pos] = y[i]
u_tmp[pos] = u[i]
v_tmp[pos] = v[i]
accel_x_tmp[pos] = accel_x[i]
accel_y_tmp[pos] = accel_y[i]
rho_tmp[pos] = rho[i]
p_tmp[pos] = p[i]
}
x = x_tmp
y = y_tmp
u = u_tmp
v = v_tmp
accel_x = accel_x_tmp
accel_y = accel_y_tmp
rho = rho_tmp
p = p_tmp
// Density
do parallel i = 0, n - 1 {
real64 rho_local = 0.0
int64 cx_local = int64(clamp((x[i] - x_min) / h, 0.0, real64(grid_nx - 1)))
int64 cy_local = int64(clamp((y[i] - y_min) / h, 0.0, real64(grid_ny - 1)))
int64 sx0_local = max(0, cx_local - 1)
int64 sx1_local = min(grid_nx - 1, cx_local + 1)
int64 sy0_local = max(0, cy_local - 1)
int64 sy1_local = min(grid_ny - 1, cy_local + 1)
int64 ncy_local = 0
int64 ncx_local = 0
int64 cell_local = 0
int64 start_local = 0
int64 count_local = 0
int64 pos_local = 0
int64 j_local = 0
real64 dx_local = 0.0
real64 dy_local = 0.0
real64 r2_local = 0.0
real64 t_local = 0.0
do ncy_local = sy0_local, sy1_local {
do ncx_local = sx0_local, sx1_local {
cell_local = ncx_local + ncy_local * grid_nx
start_local = cell_starts[cell_local]
count_local = cell_counts[cell_local]
do pos_local = 0, count_local - 1 {
j_local = start_local + pos_local
dx_local = x[i] - x[j_local]
dy_local = y[i] - y[j_local]
r2_local = dx_local * dx_local + dy_local * dy_local
if (r2_local <= h2) {
t_local = h2 - r2_local
rho_local += m * poly6_coeff * t_local * t_local * t_local
}
}
}
}
rho[i] = rho_local
}
// Pressure EOS
do parallel i = 0, n - 1 {
p[i] = k * (rho[i] - rho0)
}
// Acceleration
do parallel i = 0, n - 1 {
real64 ax_local_acc = 0.0
real64 ay_local_acc = -g
int64 cx_local_acc = int64(clamp((x[i] - x_min) / h, 0.0, real64(grid_nx - 1)))
int64 cy_local_acc = int64(clamp((y[i] - y_min) / h, 0.0, real64(grid_ny - 1)))
int64 sx0_local_acc = max(0, cx_local_acc - 1)
int64 sx1_local_acc = min(grid_nx - 1, cx_local_acc + 1)
int64 sy0_local_acc = max(0, cy_local_acc - 1)
int64 sy1_local_acc = min(grid_ny - 1, cy_local_acc + 1)
int64 ncy_local_acc = 0
int64 ncx_local_acc = 0
int64 cell_local_acc = 0
int64 start_local_acc = 0
int64 count_local_acc = 0
int64 pos_local_acc = 0
int64 j_local_acc = 0
real64 dx_local_acc = 0.0
real64 dy_local_acc = 0.0
real64 rij_local_acc = 0.0
real64 r2_local_acc = 0.0
real64 grad_fac_local_acc = 0.0
real64 lapv_local_acc = 0.0
real64 grad_px_local_acc = 0.0
real64 grad_py_local_acc = 0.0
real64 visc_x_local_acc = 0.0
real64 visc_y_local_acc = 0.0
do ncy_local_acc = sy0_local_acc, sy1_local_acc {
do ncx_local_acc = sx0_local_acc, sx1_local_acc {
cell_local_acc = ncx_local_acc + ncy_local_acc * grid_nx
start_local_acc = cell_starts[cell_local_acc]
count_local_acc = cell_counts[cell_local_acc]
do pos_local_acc = 0, count_local_acc - 1 {
j_local_acc = start_local_acc + pos_local_acc
if (j_local_acc != i) {
dx_local_acc = x[i] - x[j_local_acc]
dy_local_acc = y[i] - y[j_local_acc]
r2_local_acc = dx_local_acc * dx_local_acc + dy_local_acc * dy_local_acc
if (r2_local_acc > 0.0 && r2_local_acc <= h2) {
rij_local_acc = sqrt(r2_local_acc)
grad_fac_local_acc = spiky_grad_coeff * (h - rij_local_acc) * (h - rij_local_acc) / rij_local_acc
grad_px_local_acc = m * (p[i] + p[j_local_acc]) * grad_fac_local_acc * dx_local_acc / (2.0 * rho[i] * rho[j_local_acc])
grad_py_local_acc = m * (p[i] + p[j_local_acc]) * grad_fac_local_acc * dy_local_acc / (2.0 * rho[i] * rho[j_local_acc])
lapv_local_acc = visc_lap_coeff * (h - rij_local_acc)
visc_x_local_acc = m * visc * (u[j_local_acc] - u[i]) * lapv_local_acc / (rho[i] * rho[j_local_acc])
visc_y_local_acc = m * visc * (v[j_local_acc] - v[i]) * lapv_local_acc / (rho[i] * rho[j_local_acc])
ax_local_acc += grad_px_local_acc + visc_x_local_acc
ay_local_acc += grad_py_local_acc + visc_y_local_acc
}
}
}
}
}
accel_x[i] = ax_local_acc
accel_y[i] = ay_local_acc
}
// Explicit Euler
do parallel i = 0, n - 1 {
u[i] += dt * accel_x[i]
v[i] += dt * accel_y[i]
x[i] += dt * u[i]
y[i] += dt * v[i]
}
// Same reflective walls as the original example.
do parallel i = 0, n - 1 {
if (x[i] < x_min) {
u[i] *= -bdry_reflec
x[i] = x_min + bdry_proj_dist
} else if (x[i] > x_max) {
u[i] *= -bdry_reflec
x[i] = x_max - bdry_proj_dist
}
if (y[i] < y_min) {
v[i] *= -bdry_reflec
y[i] = y_min + bdry_proj_dist
} else if (y[i] > y_max) {
v[i] *= -bdry_reflec
y[i] = y_max - bdry_proj_dist
}
}
max_speed = 0.0
kinetic_energy = 0.0
do i = 0, n - 1 {
speed = sqrt(u[i] * u[i] + v[i] * v[i])
max_speed = max(max_speed, speed)
kinetic_energy += 0.5 * m * (u[i] * u[i] + v[i] * v[i])
}
ke_per_particle = kinetic_energy / real64(n)
if (ke_per_particle < steady_ke_per_particle_tol) {
quiet_steps += 1
} else {
quiet_steps = 0
}
if (step % progress_every == 0) {
avg_rho = 0.0
do i = 0, n - 1 {
avg_rho += rho[i]
}
avg_rho = avg_rho / real64(n)
print("step", step, "/", max_steps, "avg_rho", avg_rho, "max_speed", max_speed, "ke_per_particle", ke_per_particle, "quiet", quiet_steps)
}
if (step % output_every == 0) {
csv = "x,y,vx,vy,rho,p\n"
do i = 0, n - 1 {
csv += to_string(x[i]) + "," + to_string(y[i]) + ","
csv += to_string(u[i]) + "," + to_string(v[i]) + ","
csv += to_string(rho[i]) + "," + to_string(p[i]) + "\n"
}
frame_path = out_dir + "/frame_" + to_string(step) + ".csv"
write_file(frame_path, csv)
}
step += 1
}
avg_rho = 0.0
max_speed = 0.0
bx0 = x[0]
bx1 = x[0]
by0 = y[0]
by1 = y[0]
do i = 0, n - 1 {
avg_rho += rho[i]
speed = sqrt(u[i] * u[i] + v[i] * v[i])
max_speed = max(max_speed, speed)
bx0 = min(bx0, x[i])
bx1 = max(bx1, x[i])
by0 = min(by0, y[i])
by1 = max(by1, y[i])
}
avg_rho = avg_rho / real64(n)
string summary = "particles=" + to_string(n) + "\n"
summary += "executed_steps=" + to_string(step) + "\n"
summary += "quiet_steps=" + to_string(quiet_steps) + "\n"
summary += "avg_rho=" + to_string(avg_rho) + "\n"
summary += "ke_per_particle=" + to_string(ke_per_particle) + "\n"
summary += "max_speed=" + to_string(max_speed) + "\n"
summary += "bounds=" + to_string(bx0) + "," + to_string(bx1) + "," + to_string(by0) + "," + to_string(by1) + "\n"
write_file(out_dir + "/summary.txt", summary)
print("Done:", n, "particles, avg_rho=", avg_rho, "bounds=", bx0, bx1, by0, by1)
}