This post explains how to implement GPU acceleration in FEniCS using the Eigen linear algebra backend together with CuPy.
1. Install CuPy matching the installed CUDA version
(https://pypi.org/project/cupy-cuda12x/)
pip install cupy-cuda12x
2. Import required libraries
from fenics import *
import matplotlib.pyplot as plt
import numpy as np
import time
import cupy
import cupyx.scipy.sparse
import cupyx.scipy.sparse.linalg
import scipy.sparse as sps
import scipy.sparse.linalg as spsl

# Cap the CuPy memory pool on GPU 0 (here 40 GiB)
mempool = cupy.get_default_memory_pool()
with cupy.cuda.Device(0):
    mempool.set_limit(size=40*1024**3)

# Use the Eigen backend so assembled matrices expose their raw CSR data
parameters['linear_algebra_backend'] = 'Eigen'
CuPy vs. CuPyX
- cupy: the GPU counterpart of NumPy (dense arrays)
- cupyx.scipy: the GPU counterpart of SciPy (sparse matrices and sparse linear algebra)
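As a quick, illustrative sketch (not part of the original script), the NumPy/SciPy calls map almost one-to-one onto their CuPy/CuPyX counterparts:
import numpy as np
import scipy.sparse as sps
import cupy
import cupyx.scipy.sparse

x_cpu = np.ones(4)                                # NumPy array on the CPU
x_gpu = cupy.ones(4)                              # CuPy array on the GPU
A_cpu = sps.eye(4, format='csr')                  # SciPy sparse matrix on the CPU
A_gpu = cupyx.scipy.sparse.eye(4, format='csr')   # CuPyX sparse matrix on the GPU
y_gpu = A_gpu.dot(x_gpu)                          # sparse mat-vec runs on the GPU
print(cupy.asnumpy(y_gpu))                        # copy the result back to the CPU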
3. Define helper function
def tran2SparseMatrix(A):
    # Eigen backend exposes (row pointers, column indices, values)
    row, col, val = as_backend_type(A).data()
    return sps.csr_matrix((val, col, row))
scipy.sparse.csr_matrix
: converts the assembled matrix into CSR (Compressed Sparse Row) format
- More memory-efficient storage
- Faster matrix-vector multiplication
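To see what csr_matrix((val, col, row)) expects, here is a tiny hand-built example (illustrative, not from the original script):
import numpy as np
import scipy.sparse as sps

# CSR representation of [[1, 0, 2],
#                        [0, 0, 3],
#                        [4, 5, 6]]
val = np.array([1., 2., 3., 4., 5., 6.])  # nonzero values, row by row
col = np.array([0, 2, 2, 0, 1, 2])        # column index of each value
row = np.array([0, 2, 3, 6])              # row pointers: where each row starts in val
A = sps.csr_matrix((val, col, row))
print(A.toarray())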
4. Define simulation parameters
T = 2.0 # final time
num_steps = 5000 # number of time steps
dt = T / num_steps # time step size
mu = 0.001 # dynamic viscosity
rho = 1 # density
5. Load mesh
msh = Mesh()
with XDMFFile(MPI.comm_world, 'cylinder_dense.xdmf') as xdmf_file:
    xdmf_file.read(msh)
6. Define function spaces
V = VectorFunctionSpace(msh, 'P', 2)
Q = FunctionSpace(msh, 'P', 1)
7. Define Boundaries
inflow = 'near(x[0], 0)'
outflow = 'near(x[0], 2.2)'
walls = 'near(x[1], 0) || near(x[1], 0.41)'
cylinder = 'on_boundary && x[0]>0.1 && x[0]<0.3 && x[1]>0.1 && x[1]<0.3'
# Define inflow profile
inflow_profile = ('4.0*1.5*x[1]*(0.41 - x[1]) / pow(0.41, 2)', '0')
# Define boundary conditions
bcu_inflow = DirichletBC(V, Expression(inflow_profile, degree=2), inflow)
bcu_walls = DirichletBC(V, Constant((0, 0)), walls)
bcu_cylinder = DirichletBC(V, Constant((0, 0)), cylinder)
bcp_outflow = DirichletBC(Q, Constant(0), outflow)
bcu = [bcu_inflow, bcu_walls, bcu_cylinder]
bcp = [bcp_outflow]
8. Define trial and test functions
u = TrialFunction(V)
v = TestFunction(V)
p = TrialFunction(Q)
q = TestFunction(Q)
# Define functions for solutions at previous and current time steps
u_n = Function(V)
u_ = Function(V)
p_n = Function(Q)
p_ = Function(Q)
9. Define variational forms
# Define expressions used in variational forms
U = 0.5*(u_n + u)
n = FacetNormal(msh)
f = Constant((0, 0))
k = Constant(dt)
mu = Constant(mu)
rho = Constant(rho)
# Define symmetric gradient
def epsilon(u):
    return sym(nabla_grad(u))
# Define stress tensor
def sigma(u, p):
    return 2*mu*epsilon(u) - p*Identity(len(u))
# Define variational problem for step 1
F1 = rho*dot((u - u_n) / k, v)*dx \
   + rho*dot(dot(u_n, nabla_grad(u_n)), v)*dx \
   + inner(sigma(U, p_n), epsilon(v))*dx \
   + dot(p_n*n, v)*ds - dot(mu*nabla_grad(U)*n, v)*ds \
   - dot(f, v)*dx
a1 = lhs(F1)
L1 = rhs(F1)
# Define variational problem for step 2
a2 = dot(nabla_grad(p), nabla_grad(q))*dx
L2 = dot(nabla_grad(p_n), nabla_grad(q))*dx - (1/k)*div(u_)*q*dx
# Define variational problem for step 3
a3 = dot(u, v)*dx
L3 = dot(u_, v)*dx - k*dot(nabla_grad(p_ - p_n), v)*dx
10. Assemble to form matrices
# Assemble matrices
A1 = assemble(a1)
A2 = assemble(a2)
A3 = assemble(a3)
# Apply boundary conditions to matrices
[bc.apply(A1) for bc in bcu]
[bc.apply(A2) for bc in bcp]
11. Create XDMF files for visualization output
xdmffile_u = XDMFFile('navier_GPU/velocity_dense.xdmf')
xdmffile_p = XDMFFile('navier_GPU/pressure_dense.xdmf')
12. Convert assembled matrices to sparse format
A1_gpu = cupyx.scipy.sparse.csr_matrix(tran2SparseMatrix(A1))
A2_gpu = cupyx.scipy.sparse.csr_matrix(tran2SparseMatrix(A2))
A3_gpu = cupyx.scipy.sparse.csr_matrix(tran2SparseMatrix(A3))
cupyx.scipy.sparse.csr_matrix
: copies a CSR matrix from CPU memory into a CSR matrix on the GPU
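As a quick sanity check (illustrative only, using the names defined above), you can confirm the matrices now live on the GPU and see how much device memory the copies take:
print(A1_gpu.shape, A1_gpu.nnz)                        # size and number of nonzeros
print(mempool.used_bytes() / 1024**2, 'MiB used on the GPU')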
13. Allocate GPU memory for RHS vectors
b1_gpu = cupy.zeros_like(cupy.array(assemble(L1)[:]))
b2_gpu = cupy.zeros_like(cupy.array(assemble(L2)[:]))
b3_gpu = cupy.zeros_like(cupy.array(assemble(L3)[:]))
cupy.array
: copies a NumPy array from the CPU to a CuPy array on the GPU
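A minimal sketch of the host/device round trip used throughout the time loop (illustrative, reusing the form L1 defined above):
b_cpu = assemble(L1)[:]        # NumPy array on the CPU (host)
b_gpu = cupy.asarray(b_cpu)    # copy host -> device
b_back = cupy.asnumpy(b_gpu)   # copy device -> host
print(type(b_gpu), type(b_back))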
14. Timestep integration
t = 0
start = time.time()
for n in range(num_steps):
    start_timestep = time.time()

    # Update current time
    t += dt

    # Step 1: Tentative velocity step
    b1 = assemble(L1)
    [bc.apply(b1) for bc in bcu]
    b1_gpu = cupy.asarray(b1[:])
    u_.vector()[:] = cupy.asnumpy(cupyx.scipy.sparse.linalg.cg(A1_gpu, b1_gpu)[0])

    # Step 2: Pressure correction step
    b2 = assemble(L2)
    [bc.apply(b2) for bc in bcp]
    b2_gpu = cupy.asarray(b2[:])
    p_.vector()[:] = cupy.asnumpy(cupyx.scipy.sparse.linalg.cg(A2_gpu, b2_gpu)[0])

    # Step 3: Velocity correction step
    b3_gpu[:] = cupy.asarray(assemble(L3)[:])
    u_.vector()[:] = cupy.asnumpy(cupyx.scipy.sparse.linalg.cg(A3_gpu, b3_gpu)[0])

    # Save solution
    xdmffile_u.write(u_, t)
    xdmffile_p.write(p_, t)

    # Update previous solution
    u_n.assign(u_)
    p_n.assign(p_)

    # print('Time:', t)
    # print('u max:', u_.vector().max())
    # print('p max:', p_.vector().max())

    end_timestep = time.time()
    # print('GPU(s)/iteration', end_timestep - start_timestep)
    print(n, end_timestep - start, end_timestep - start_timestep)

end = time.time()
print("Total GPU execution time:", end - start)
14.1. IPCS algorithm
# Step 1: Tentative velocity step
b1 = assemble(L1)
[bc.apply(b1) for bc in bcu]
b1_gpu = cupy.asarray(b1[:])
u_.vector()[:] = cupy.asnumpy(cupyx.scipy.sparse.linalg.cg(A1_gpu, b1_gpu)[0])
# Step 2: Pressure correction step
b2 = assemble(L2)
[bc.apply(b2) for bc in bcp]
b2_gpu = cupy.asarray(b2[:])
p_.vector()[:] = cupy.asnumpy(cupyx.scipy.sparse.linalg.cg(A2_gpu, b2_gpu)[0])
# Step 3: Velocity correction step
b3_gpu[:] = cupy.asarray(assemble(L3)[:])
u_.vector()[:] = cupy.asnumpy(cupyx.scipy.sparse.linalg.cg(A3_gpu, b3_gpu)[0])
cupyx.scipy.sparse.linalg.cg
: solves the linear system on the GPU with the Conjugate Gradient (CG) method
cupy.asnumpy
: copies the GPU solution vector back to a NumPy array on the CPU
assemble
: assembles the form over the mesh on the CPU to build the RHS vector
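cupyx.scipy.sparse.linalg.cg follows the SciPy interface and returns (x, info). A small illustrative variant (not in the original post) that warm-starts the pressure solve from the previous time step and checks convergence:
x_gpu, info = cupyx.scipy.sparse.linalg.cg(
    A2_gpu, b2_gpu,
    x0=cupy.asarray(p_n.vector()[:]),  # initial guess: previous pressure (assumption)
    tol=1e-8)                          # tighter tolerance than the default 1e-5
if info != 0:
    print('CG did not converge, info =', info)
p_.vector()[:] = cupy.asnumpy(x_gpu)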
15. Results
15.1. # of cells = 24,287
- CPU: 0.32 s/iteration
- GPU: 0.16 s/iteration
- 2.0x speedup
15.2. # of cells = 595,161
- CPU: 10.90 s/iteration
- GPU: 1.62 s/iteration
- ~6.7x speedup