Source code for pylorenzmie.theory.cupyLorenzMie

from pylorenzmie.theory.LorenzMie import LorenzMie
import cupy as cp
import logging


logger = logging.getLogger(__name__)
logger.setLevel(logging.WARNING)


[docs] class cupyLorenzMie(LorenzMie): ''' Compute scattered light field with CUDA acceleration. ... Inherits -------- pylorenzmie.theory.LorenzMie Properties ---------- double_precision : bool If True, perform GPU calculations in double precision. Default: True Methods ------- field(cartesian=True, bohren=True, gpu=False) Returns the complex-valued field at each of the coordinates. gpu : bool If True, return gpu variable for field. Default [False]: return cpu field ''' method: str = 'cupy numpy' def __init__(self, *args, double_precision: bool = True, **kwargs) -> None: self.double_precision = double_precision super().__init__(*args, **kwargs) @property def double_precision(self) -> bool: return self._double_precision @double_precision.setter def double_precision(self, double_precision: bool) -> None: if double_precision: try: self.lorenzmie = self.culorenzmie() self.dtype = cp.float64 self.ctype = cp.complex128 except cp.cuda.runtime.CUDARuntimeError: logger.warning('GPU not capable of double precision. ' 'Falling back to single precision.') double_precision = False if not double_precision: self.lorenzmie = self.culorenzmief() self.dtype = cp.float32 self.ctype = cp.complex64 self._double_precision = double_precision def _allocate(self) -> None: '''Allocate buffers for calculation''' shape = self.coordinates.shape self._field = cp.empty(shape, dtype=self.ctype) self.buffer = cp.empty(shape, dtype=self.ctype) self.coords = cp.asarray(self.coordinates, self.dtype) self.threadsperblock = 32 self.blockspergrid = ((shape[1] + (self.threadsperblock - 1)) // self.threadsperblock)
[docs] def hologram(self, cartesian: bool = True, bohren: bool = True, device: bool = False) -> cp.ndarray: '''Returns the hologram of the particle Returns ------- hologram : cp.ndarray The hologram of the particle on the GPU Keywords -------- cartesian : bool If True, compute the field in cartesian coordinates. Default: True bohren : bool If True, use Bohren's convention for the field. Default: True ''' field = self.field(cartesian=cartesian, bohren=bohren, device=True) field[0, :] += 1. hologram = cp.sum(field.real**2 + field.imag**2, axis=0) return hologram if device else hologram.get()
[docs] def field(self, cartesian: bool = True, bohren: bool = True, device: bool = False) -> cp.ndarray: '''Returns the field scattered by a particle''' k = self.dtype(self.instrument.wavenumber()) n_m = self.instrument.n_m wavelength = self.instrument.wavelength self._field.fill(0.+0.j) for particle in self.particle: ab = particle.ab(n_m, wavelength) a = cp.asarray(ab[:, 0], dtype=self.ctype) b = cp.asarray(ab[:, 1], dtype=self.ctype) r_p = (particle.r_p + particle.r_0).astype(self.dtype) self.lorenzmie((self.blockspergrid,), (self.threadsperblock,), (*self.coords, self.coords.shape[1], a, b, ab.shape[0], *r_p, k, cartesian, bohren, *self.buffer)) self._field += self.buffer return self._field if device else self._field.get()
[docs] def culorenzmief(self) -> cp.RawKernel: '''Return CUDA kernel for single-precision field computation''' return cp.RawKernel(self._cudalorenzmie, 'lorenzmie')
[docs] def culorenzmie(self) -> cp.RawKernel: '''Return CUDA kernel for double-precision field computation''' change = {'f(': '(', '.f': '.', 'float': 'double', 'Float': 'Double'} code = self._cudalorenzmie for before, after in change.items(): code = code.replace(before, after) return cp.RawKernel(code, 'lorenzmie')
_cudalorenzmie = r''' # include <cuComplex.h> // avoid the overhead of cuC* macros. static __device__ __inline__ cuFloatComplex cmul(const cuFloatComplex &a, const cuFloatComplex &b) { return make_cuFloatComplex(a.x*b.x - a.y*b.y, a.x*b.y + a.y*b.x); } static __device__ __inline__ cuFloatComplex cadd(const cuFloatComplex &a, const cuFloatComplex &b) { return make_cuFloatComplex(a.x + b.x, a.y + b.y); } static __device__ __inline__ cuFloatComplex csub(const cuFloatComplex &a, const cuFloatComplex &b) { return make_cuFloatComplex(a.x - b.x, a.y - b.y); } static __device__ __inline__ cuFloatComplex cdiv(const cuFloatComplex &a, const cuFloatComplex &b) { // a/b = (a * conj(b)) / |b|^2 float denom = b.x*b.x + b.y*b.y; return make_cuFloatComplex((a.x*b.x + a.y*b.y) / denom, (a.y*b.x - a.x*b.y) / denom); } // kernel parameters marked __restrict__/const and read with __ldg where // appropriate to help the optimizer and use the read‑only cache. extern "C" __global__ void lorenzmie(const float * __restrict__ x, const float * __restrict__ y, const float * __restrict__ z, int length, const cuFloatComplex * __restrict__ a, const cuFloatComplex * __restrict__ b, int norders, float x_p, float y_p, float z_p, float k, bool bohren, bool cartesian, cuFloatComplex * __restrict__ e1, cuFloatComplex * __restrict__ e2, cuFloatComplex * __restrict__ e3) { // constant values reused by every thread const cuFloatComplex i = make_cuFloatComplex(0.f, 1.f); const cuFloatComplex one = make_cuFloatComplex(1.f, 0.f); const cuFloatComplex two = make_cuFloatComplex(2.f, 0.f); const cuFloatComplex phase = make_cuFloatComplex(cosf(k*z_p), -sinf(k*z_p)); // sign factor for 'bohren' does not depend on the particle index, // compute it once per kernel invocation. const cuFloatComplex bohrenSign = bohren ? make_cuFloatComplex(0.f, 1.f) : make_cuFloatComplex(0.f, -1.f); // thread index and stride for strided loop int idx = blockIdx.x * blockDim.x + threadIdx.x; const int stride = blockDim.x * gridDim.x; // iterate over particles with a simple strided loop for (; idx < length; idx += stride) { // read coordinate values through __ldg so they may be cached float kx = k * (__ldg(&x[idx]) - x_p); float ky = k * (__ldg(&y[idx]) - y_p); float kz = -k * (__ldg(&z[idx]) - z_p); float krho = hypotf(kx, ky); float kr = hypotf(krho, kz); float phi = atan2f(ky, kx); float theta = atan2f(krho, kz); float sinphi, cosphi, sintheta, costheta, sinkr, coskr; __sincosf(phi, &sinphi, &cosphi); __sincosf(theta, &sintheta, &costheta); __sincosf(kr, &sinkr, &coskr); cuFloatComplex sinp = make_cuFloatComplex(sinphi, 0.f); cuFloatComplex cosp = make_cuFloatComplex(cosphi, 0.f); cuFloatComplex sint = make_cuFloatComplex(sintheta, 0.f); cuFloatComplex cost = make_cuFloatComplex(costheta, 0.f); cuFloatComplex sinkrc = make_cuFloatComplex(sinkr, 0.f); cuFloatComplex coskrc = make_cuFloatComplex(coskr, 0.f); cuFloatComplex krc = make_cuFloatComplex(kr, 0.f); cuFloatComplex inv_kr = make_cuFloatComplex(1.f/kr, 0.f); // sign depending only on kz and bohren flag cuFloatComplex factor = make_cuFloatComplex((kz > 0) - (kz < 0), 0.f); factor = cmul(factor, bohrenSign); cuFloatComplex xi_nm2 = cadd(coskrc, cmul(factor, sinkrc)); cuFloatComplex xi_nm1 = csub(sinkrc, cmul(factor, coskrc)); cuFloatComplex xi_n; cuFloatComplex pi_nm1 = make_cuFloatComplex(0.f,0.f); cuFloatComplex pi_n = make_cuFloatComplex(1.f,0.f); cuFloatComplex mo1nr = make_cuFloatComplex(0.f,0.f); cuFloatComplex mo1nt = make_cuFloatComplex(0.f,0.f); cuFloatComplex mo1np = make_cuFloatComplex(0.f,0.f); cuFloatComplex ne1nr = make_cuFloatComplex(0.f,0.f); cuFloatComplex ne1nt = make_cuFloatComplex(0.f,0.f); cuFloatComplex ne1np = make_cuFloatComplex(0.f,0.f); cuFloatComplex esr = make_cuFloatComplex(0.f,0.f); cuFloatComplex est = make_cuFloatComplex(0.f,0.f); cuFloatComplex esp = make_cuFloatComplex(0.f,0.f); cuFloatComplex imagFactor = one; // (-i)^j accumulator #pragma unroll 8 for (int j = 1; j < norders; ++j) { cuFloatComplex n = make_cuFloatComplex((float)j, 0.f); cuFloatComplex swisc = cmul(pi_n, cost); cuFloatComplex twisc = csub(swisc, pi_nm1); cuFloatComplex tau_n = csub(pi_nm1, cmul(n, twisc)); xi_n = csub(cmul(csub(cmul(two, n), one), cdiv(xi_nm1, krc)), xi_nm2); cuFloatComplex dn = csub(cdiv(cmul(n, xi_n), krc), xi_nm1); mo1nt = cmul(pi_n, xi_n); mo1np = cmul(tau_n, xi_n); ne1nr = cmul(cmul(cmul(n, cadd(n, one)), pi_n), xi_n); ne1nt = cmul(tau_n, dn); ne1np = cmul(pi_n, dn); imagFactor = cmul(imagFactor, i); cuFloatComplex en = cdiv(cdiv(cmul(imagFactor, cadd(cmul(two, n), one)), n), cadd(n, one)); cuFloatComplex aj = a[j]; cuFloatComplex bj = b[j]; esr = cadd(esr, cmul(cmul(cmul(i, en), aj), ne1nr)); est = cadd(est, cmul(cmul(cmul(i, en), aj), ne1nt)); esp = cadd(esp, cmul(cmul(cmul(i, en), aj), ne1np)); esr = csub(esr, cmul(cmul(en, bj), mo1nr)); est = csub(est, cmul(cmul(en, bj), mo1nt)); esp = csub(esp, cmul(cmul(en, bj), mo1np)); // update recursion variables pi_nm1 = pi_n; pi_n = cadd(swisc, cmul(cdiv(cadd(n, one), n), twisc)); xi_nm2 = xi_nm1; xi_nm1 = xi_n; } esr = cmul(esr, cmul(cosp, inv_kr)); esr = cmul(esr, cmul(sint, inv_kr)); est = cmul(est, cmul(cosp, inv_kr)); esp = cmul(esp, cmul(sinp, inv_kr)); if (cartesian) { cuFloatComplex ecx = csub(cadd(cmul(esr, cmul(sint, cosp)), cmul(est, cmul(cost, cosp))), cmul(esp, sinp)); cuFloatComplex ecy = cadd(cmul(esr, cmul(sint, sinp)), cadd(cmul(est, cmul(cost, sinp)), cmul(esp, cosp))); cuFloatComplex ecz = csub(cmul(esr, cost), cmul(est, sint)); e1[idx] = cmul(ecx, phase); e2[idx] = cmul(ecy, phase); e3[idx] = cmul(ecz, phase); } else { e1[idx] = cmul(esr, phase); e2[idx] = cmul(est, phase); e3[idx] = cmul(esp, phase); } } } '''
if __name__ == '__main__': cupyLorenzMie.example(double_precision=True)