header("6. RAW CUDA KERNEL — MANDELBROT")

# Escape-time Mandelbrot kernel, one thread per pixel. Each thread maps its
# pixel (ix, iy) to a point c = (cx, cy) inside [xmin, xmax] x [ymin, ymax],
# iterates z <- z*z + c starting from z = 0, and writes how many iterations
# ran before |z|^2 reached 4 (capped at max_iter) into the row-major buffer
# `out` of W * H ints.
mandel = cp.RawKernel(r'''
extern "C" __global__
void mandel(float xmin, float xmax, float ymin, float ymax,
            int W, int H, int max_iter, int* out) {
    int ix = blockDim.x * blockIdx.x + threadIdx.x;
    int iy = blockDim.y * blockIdx.y + threadIdx.y;
    // The grid is rounded up to whole blocks, so threads that fall
    // outside the W x H image must not write anything.
    if (ix >= W || iy >= H) return;

    // Linear map from pixel index to the complex plane.
    float cx = xmin + (xmax - xmin) * ix / (W - 1);
    float cy = ymin + (ymax - ymin) * iy / (H - 1);

    float zx = 0.f, zy = 0.f;
    int it = 0;
    while (zx*zx + zy*zy < 4.f && it < max_iter) {
        float t = zx*zx - zy*zy + cx;   // real part of z*z + c
        zy = 2.f*zx*zy + cy;            // imaginary part (uses the old zx)
        zx = t; ++it;
    }
    out[iy*W + ix] = it;
}
''', 'mandel')

W, H, ITER = 1024, 1024, 400
img = cp.zeros((H, W), dtype=cp.int32)

# 2-D launch configuration: x spans the image width, y spans the height.
# The block counts are rounded up so partial tiles at the edges are covered.
threads = (16, 16)
blocks = ((W + threads[0] - 1) // threads[0],
          (H + threads[1] - 1) // threads[1])

# RawKernel has no type information for its arguments: plain Python ints
# would be passed as 64-bit `long long`, while the kernel declares 32-bit
# `int`. Convert explicitly so every argument matches the C signature.
mandel(
    blocks,
    threads,
    (
        cp.float32(-2.0), cp.float32(1.0),
        cp.float32(-1.5), cp.float32(1.5),
        cp.int32(W), cp.int32(H), cp.int32(ITER),
        img,
    ),
)
# Wait for the default (null) stream to finish before reading the result.
cp.cuda.Stream.null.synchronize()
print(f"Mandelbrot performed. max iter reached={int(img.max())}")

plt.figure(figsize=(6, 6))
# log1p compresses the iteration counts so low-count detail stays visible;
# the array is copied to host memory for matplotlib.
plt.imshow(
    cp.asnumpy(cp.log1p(img)),
    cmap='twilight_shifted',
    extent=[-2, 1, -1.5, 1.5],
)
plt.title("Mandelbrot set — computed with a CuPy RawKernel")
plt.axis('off')
plt.show()
header("7. CUDA STREAMS")

# Two independent non-blocking streams. Work issued inside a `with stream:`
# block is enqueued on that stream, so the two matrix products below are
# not ordered relative to each other.
s1 = cp.cuda.Stream(non_blocking=True)
s2 = cp.cuda.Stream(non_blocking=True)

with s1:
    a1 = cp.random.rand(2000, 2000, dtype=cp.float32)
    b1 = cp.random.rand(2000, 2000, dtype=cp.float32)
    c1 = a1 @ b1

with s2:
    a2 = cp.random.rand(2000, 2000, dtype=cp.float32)
    b2 = cp.random.rand(2000, 2000, dtype=cp.float32)
    c2 = a2 @ b2

# Block the host until both streams have drained before touching c1 / c2.
s1.synchronize()
s2.synchronize()

print(f"Stream-1 mean={float(c1.mean()):.4f}")
print(f"Stream-2 mean={float(c2.mean()):.4f}")
Home › Artificial Intelligence › Coding implementation to grasp GPU computing utilizing CuPy, customized CUDA kernels, streams, sparse matrices, and profiling
Coding implementation to grasp GPU computing utilizing CuPy, customized CUDA kernels, streams, sparse matrices, and profiling
by root

