Kernel Interpolation C++

Question

Purpose of this program is to calculate the velocity of ng = nxnynz particles at the Lagrangian points from the values at regular grid nodes. How can i speed up this calculation? I hope to receive suggestions on this following code:

#include <iomanip>
#include <omp.h>
#include <vector>
#include <cmath> 
#include <ctime>
#include <iostream>
#include <string>
#include <fstream>
#include <string.h>
#include <sstream>
#include <istream>
#include <stdlib.h>     
#include <algorithm>    

using namespace std;

void functionM4( vector<double> &b, vector<double> &w)   
// this is an interpolation function
{

    w[0]= 0.5*b[0]*b[0]*b[0]+2.5*b[0]*b[0]+4.0*b[0]+2.0;
    w[1]=-1.5*b[1]*b[1]*b[1]-2.5*b[1]*b[1]+1.0;
    w[2]= 1.5*b[2]*b[2]*b[2]-2.5*b[2]*b[2]+1.0;
    w[3]=-0.5*b[3]*b[3]*b[3]+2.5*b[3]*b[3]-4.0*b[3]+2.0;

}

int main()
{
    double pi = 3.141592653589793;

    int nx0 = 400;            // number of grid in X direction
    int ny0 = 400;            // number of grid in Y direction
    int nz0 = 400;            // number of grid in Y direction

    int nx = nx0+10;             
    int ny = ny0+10;            
    int nz = nz0+10;           
    int ng = nx*ny*nz;

    double xmin = 0.0;      // set up domain  
    double xmax = 2.0*pi;             
    double ymin = 0.0;            
    double ymax = 2.0*pi;             
    double zmin = 0.0;        
    double zmax = 2.0*pi;   

    double dx = (xmax-xmin)/(nx0);  //spacing step
    double dy = (ymax-ymin)/(ny0);      
    double dz = (zmax-zmin)/(nz0);      
    
    vector<double> x(nx,0.0);           
    vector<double> y(ny,0.0);            
    vector<double> z(nz,0.0);       

    for (int i=0;i<nx;i++)
        x[i] = xmin+(double)(i-5)*dx;  //building grid for x direction
        
    for (int j=0;j<ny;j++)
        y[j] = ymin+(double)(j-5)*dy;

    for (int k=0;k<nz;k++)
        z[k] = zmin+(double)(k-5)*dz;
 // ug is velocity component u on grid at the beginning
    vector<vector<vector<double> > > ug (nx,vector<vector<double> >(ny,vector <double>(nz,0.0))); 
    vector<vector<vector<double> > > vg (nx,vector<vector<double> >(ny,vector <double>(nz,0.0)));     
    vector<vector<vector<double> > > wg (nx,vector<vector<double> >(ny,vector <double>(nz,0.0)));   

// particles at Lagrangian points
    vector<vector<vector<double> > > xpStar (nx,vector<vector<double> >(ny,vector <double>(nz,0.0))); 
    vector<vector<vector<double> > > ypStar (nx,vector<vector<double> >(ny,vector <double>(nz,0.0)));     
    vector<vector<vector<double> > > zpStar (nx,vector<vector<double> >(ny,vector <double>(nz,0.0)));

//the particle velocity at Lagrangian locations
    vector<vector<vector<double> > > upStar (nx,vector<vector<double> >(ny,vector <double>(nz,0.0)));        
    vector<vector<vector<double> > > vpStar (nx,vector<vector<double> >(ny,vector <double>(nz,0.0)));     
    vector<vector<vector<double> > > wpStar (nx,vector<vector<double> >(ny,vector <double>(nz,0.0))); 
    
    int i,j,k;
    #pragma omp parallel for private (j,k) schedule(dynamic)
    for (i=0;i<nx;i++)
        for (j=0;j<ny;j++)
            for (k=0;k<nz;k++)
            {
                //initial values of velocity components at grid nodes
                ug[i][j][k] = -sin(x[i])*cos(y[j])*sin(z[k]);    
                vg[i][j][k] = -cos(x[i])*sin(y[j])*sin(z[k]);
                wg[i][j][k] = -2.0*cos(x[i])*cos(y[j])*cos(z[k]);
            }


    double dt = 0.001;  // time step

//the particle velocity upStar at Lagrangian location xpStar is 
//obtained from the velocity ug at the grid nodes (x,y,z) 
//using a kernel interpolation function.

    cout<<" start "<<endl;
    #pragma omp parallel for private (j,k) schedule(dynamic)
    for (i=0;i<nx;i++) 
        for (j=0;j<ny;j++) 
            for (k=0;k<nz;k++){

                xpStar[i][j][k]= x[i] + dt*ug[i][j][k]; 
                ypStar[i][j][k]= y[j] + dt*vg[i][j][k];
                zpStar[i][j][k]= z[k] + dt*wg[i][j][k];
            }

    #pragma omp parallel for private (j,k) schedule(dynamic)   
    for (i=2;i<nx-2;i++) 
        for (j=2;j<ny-2;j++) 
            for (k=2;k<nz-2;k++){


                int Xindex = (int)((xpStar[i][j][k]- x[0])/dx);
                int Yindex = (int)((ypStar[i][j][k]- y[0])/dy);
                int Zindex = (int)((zpStar[i][j][k]- z[0])/dz);

                int west  = Xindex-1;  int est   = Xindex+2;
                int south = Yindex-1;  int north = Yindex+2;                
                int front = Zindex-1;  int back  = Zindex+2;    

                vector<double> BX(4,0.0);
                vector<double> BY(4,0.0);
                vector<double> BZ(4,0.0);

                vector<double> wx(4,0.0);
                vector<double> wy(4,0.0);
                vector<double> wz(4,0.0);

                for (int m=west;m<=est;m++){
                    BX[m-west]=(x[m]-xpStar[i][j][k])/dx;}
    
                for (int n=south;n<=north;n++){
                    BY[n-south]=(y[n]-ypStar[i][j][k])/dy;}

                for (int q=front;q<=back;q++){
                    BZ[q-front]=(z[q]-zpStar[i][j][k])/dz;}

                functionM4(BX,wx);
                functionM4(BY,wy);
                functionM4(BZ,wz);

                for (int m=west;m<=est;m++) 
                    for (int n=south;n<=north;n++)
                        for (int q=front;q<=back;q++){ 
                            
                            double w=wx[m-west]*wy[n-south]*wz[q-front];

                            upStar[i][j][k] += ug[m][n][q]*w;
                            vpStar[i][j][k] += vg[m][n][q]*w;
                            wpStar[i][j][k] += wg[m][n][q]*w;
                        }
            }


    cout<<" finish ! "<<endl;

    //-------------------------------------Checking results------------------------------------------//
    vector<vector<vector<double> > > exact (nx,vector<vector<double> >(ny,vector <double>(nz,0.0)));
    vector<vector<vector<double> > > error (nx,vector<vector<double> >(ny,vector <double>(nz,0.0)));

    #pragma omp parallel for
    for (int i=3;i<nx-3;i++)
        for (int j=3;j<ny-3;j++)
            for (int k=3;k<nz-3;k++)
            {
                exact[i][j][k] = -sin(xpStar[i][j][k])*cos(ypStar[i][j][k])*sin(zpStar[i][j][k]);
                error[i][j][k] = abs(exact[i][j][k]-upStar[i][j][k]);
            }

    double aa = 0.0;
    double bb = 0.0;
    double cc = 0.0;
    double dd = 0.0;
    double maxi = 0.0;
    for (int i=3;i<nx-3;i++)
        for (int j=3;j<ny-3;j++)
            for (int k=3;k<nz-3;k++)
            {
                aa += abs(exact[i][j][k]);
                bb += error[i][j][k];
                cc += error[i][j][k]*error[i][j][k];
                dd += exact[i][j][k]*exact[i][j][k];

                if (abs(error[i][j][k])>maxi)
                    maxi = abs(error[i][j][k]);
            }

    cout<<" L1_norm = "<<bb/aa<<endl;
    cout<<" L2_norm = "<<sqrt(cc/dd)<<endl;
    cout<<" L3_norm = "<<maxi<<endl;

    return 0;
//Compiler: g++ interpolation.cpp -o running -fopenmp
//export OMP_NUM_THREADS=30;
//Run:./running
}

you should use Horner's rule for computing the elements of w, which should save a few multiplications. — xdavidliu
– xdavidliu, Commented Oct 21, 2020 at 4:19
Using Horner's rule rule, it can save 1 or 2 multiplications. Thank you very much @xdavidliu — Luc Nguyen
– Luc Nguyen, Commented Oct 21, 2020 at 4:25

user673679 · Accepted Answer · 2020-10-22 15:22:20Z

(Note: I don't know anything about OMP, so I haven't commented on it!)

double pi = 3.141592653589793;

int nx0 = 400;            // number of grid in X direction
...

Constants should be declared const (it helps the programmer, if not the compiler).

for (i=0;i<nx;i++)
    for (j=0;j<ny;j++)
        for (k=0;k<nz;k++)
        {
            ug[i][j][k] = -sin(x[i])*cos(y[j])*sin(z[k]);    
            //initial values of velocity components at grid nodes
            vg[i][j][k] = -cos(x[i])*sin(y[j])*sin(z[k]);
            wg[i][j][k] = -2.0*cos(x[i])*cos(y[j])*cos(z[k]);
        }

I know this is just the setup phase, but that's a lot of repeated cos and sin calculations for the same values. We could move the calculations into the outer loops, or (better) we could pre-calculate sin(x[n]), cos(x[n]), sin(y[n]), etc. separately.

Iterating over all 3 velocity grids at once is likely to be slower than 3 separate loops to fill each one independently. (Due to cache misses).

vector<vector<vector<double> > > ug (nx,vector<vector<double> >(ny,vector <double>(nz,0.0)));

This is quite complicated in terms of memory allocation so it's slow to create (lots of copying of those one and two dimensional vectors) and to iterate (allocated memory is unlikely to be contiguous, so we don't know how to access elements in a cache-efficient manner).

Instead, try allocating a single vector, and calculating the indices from 3d coordinates, something like:

template<class T>
struct array_3d
{
    using index_t = typename std::vector<T>::size_type;

    array_3d(index_t width, index_t height, index_t depth):
        m_width(width), m_height(height), m_depth(depth),
        m_data(m_width * m_height * m_depth) { }

    T& at(index_t x, index_t y, index_t z)
    {
        return m_data[x + y * m_width + z * m_width * m_height];
    }
    
    T const& at(index_t x, index_t y, index_t z) const
    {
        return m_data[x + y * m_width + z * m_width * m_height];
    }

    index_t m_width, m_height, m_depth;
    std::vector<T> m_data;
};

If we're careful in how we iterate this data structure, we can avoid skipping around in memory, e.g.:

for (int k=0;k<nz;k++)
    for (int j=0;j<ny;j++)
        for (int i=0;i<nx;i++)
            xpStar.at(i, j, k)= x[i] + dt*ug.at(i, j, k);

Every access of xpStar.m_data alters the next element in the array. (So we could even avoid calculating the index from the coordinates and just increment an iterator, which may or may not be faster).

(Note: The way I implemented it above, has the x component innermost (so x+1 is index+1). This is the opposite of your initial code (which has z innermost), but it doesn't really matter which you choose, as long as you loop through the dimensions in the right order when iterating).

            vector<double> BX(4,0.0);
            vector<double> BY(4,0.0);
            vector<double> BZ(4,0.0);

            vector<double> wx(4,0.0);
            vector<double> wy(4,0.0);
            vector<double> wz(4,0.0);

Since these are a fixed size, use std::array<double, 4> instead!

            for (int m=west;m<=est;m++) 
                for (int n=south;n<=north;n++)
                    for (int q=front;q<=back;q++){ 
                        
                        double w=wx[m-west]*wy[n-south]*wz[q-front];

                        upStar[i][j][k] += ug[m][n][q]*w;
                        vpStar[i][j][k] += vg[m][n][q]*w;
                        wpStar[i][j][k] += wg[m][n][q]*w;
                    }

Again, it might be faster to split this into separate loops for upStar, vpStar and wpStar (and then precalculate w), e.g.:

            array<double, 4*4*4> w;
            
            for (int q=0;q!=4;q++)
                for (int n=0;n!=4;n++)
                    for (int m=0;m!=4;m++) 
                        w[m + n*4 + q*4*4] = wx[m]*wy[n]*wz[q];

            for (int q=0;q!=4;q++)
                for (int n=0;n!=4;n++)
                    for (int m=0;m!=4;m++) 
                        upStar.at(i, j, k) += ug.at(m+west, n+south, q+front) * w[m + n*4 + q*4*4];

            for (int q=0;q!=4;q++)
                for (int n=0;n!=4;n++)
                    for (int m=0;m!=4;m++) 
                        vpStar.at(i, j, k) += vg.at(m+west, n+south, q+front) * w[m + n*4 + q*4*4];

            for (int q=0;q!=4;q++)
                for (int n=0;n!=4;n++)
                    for (int m=0;m!=4;m++) 
                        wpStar.at(i, j, k) += wg.at(m+west, n+south, q+front) * w[m + n*4 + q*4*4];

Thank you very much for introducing template<class T> struct array_3d replacing 3-D vectors. It is very nice. I tried it, and it works well. Could you explain 'T& at(index_t x, index_t y, index_t z)' and 'T& at(index_t x, index_t y, index_t z) const'? @user673679 — Luc Nguyen
– Luc Nguyen, Commented Oct 22, 2020 at 0:29
I tested the code with separation of loops for upStar, vpStar and wpStar, the calculation speed is not better. However, the way code written is very clear, and helpful. — Luc Nguyen
– Luc Nguyen, Commented Oct 22, 2020 at 0:46
Ah yep, that second at() function should return a const&, my bad. — user673679
– user673679, Commented Oct 22, 2020 at 15:22

Stack Exchange Network

Kernel Interpolation C++

1 Answer 1

You must log in to answer this question.

Hot Network Questions

Kernel Interpolation C++

1 Answer 1

You must log in to answer this question.

Related

Hot Network Questions