added 70 characters in body

Source Link

edited Mar 21, 2018 at 7:55

113
6

import pandas as pd
import numpy as np

def synchronize(df,sync_with=[]):
    _df = df.copy()
    
    if not isinstance(sync_with,list):
        sync_with = [sync_with]
    
    _fixed_cols = ['date','agent']
    _fixed_cols.extend(sync_with)
    _colset = [c for c in _df.columns if c not in _fixed_cols]
    
    for ag in _df.agent.unique():
        _df.loc[_df.agent==ag,_colset] = _df.loc[_df.agent==ag,_colset].fillna(method='ffill')
        if sync_with_sync_with:
            _df.loc[_df.agent==ag,:] = _df.loc[_df.agent==ag,:].dropna(how='all', subset=sync_withsubset=_sync_with)
            _df.loc[_df.agent==ag,:] = _df.loc[_df.agent==ag,:].fillna(method='ffill')
        
    return _df.dropna(how='all', subset=_sync_with)

# 1. default (10.113 ms per loop)
print(synchronize(foo))
    _a    _b     _c  agent       date
0  1.0  11.0  111.0      1 2010-01-01
1  1.0  22.0  111.0      1 2011-01-01
2  1.0  22.0  333.0      1 2012-01-01
3  4.0  22.0  333.0      1 2013-01-01
4  5.0  55.0    NaN      2 2010-01-01
5  5.0  55.0  666.0      2 2013-01-01
6  7.0  77.0  777.0      2 2015-01-01
7  8.0  77.0  777.0      2 2016-01-01

# 2. sync with one column (54.935 ms per loop)
print(synchronize(foo,'_c'))
    _a    _b     _c  agent       date
0  1.0  11.0  111.0      1 2010-01-01
2  1.0  22.0  333.0      1 2012-01-01
5  NaN 5.0  NaN55.0  666.0      2 2013-01-01
6  7.0  77.0  777.0      2 2015-01-01

# 3. sync with two columns (53.435 ms per loop)
print(synchronize(foo,['_a','_b''_b']))
    _a    _b     _c  agent       date
0  1.0  11.0  111.0      1 2010-01-01
1  1.0  22.0  111.0      1 2011-01-01
3  4.0  22.0  333.0      1 2013-01-01
4  5.0  55.0    NaN      2 2010-01-01
6  7.0  77.0  777.0      2 2015-01-01
7  8.0  77.0  777.0      2 2016-01-01

import pandas as pd
import numpy as np

def synchronize(df,sync_with=[]):
    _df = df.copy()
    
    if not isinstance(sync_with,list):
        sync_with = [sync_with]
    
    _fixed_cols = ['date','agent']
    _fixed_cols.extend(sync_with)
    _colset = [c for c in _df.columns if c not in _fixed_cols]
    
    for ag in _df.agent.unique():
        _df.loc[_df.agent==ag,_colset] = _df.loc[_df.agent==ag,_colset].fillna(method='ffill')
        if sync_with:
            _df = _df.dropna(how='all', subset=sync_with)
            _df.loc[_df.agent==ag,:] = _df.loc[_df.agent==ag,:].fillna(method='ffill')
        
    return _df

# 1. default (10.1 ms per loop)
print(synchronize(foo))
    _a    _b     _c  agent       date
0  1.0  11.0  111.0      1 2010-01-01
1  1.0  22.0  111.0      1 2011-01-01
2  1.0  22.0  333.0      1 2012-01-01
3  4.0  22.0  333.0      1 2013-01-01
4  5.0  55.0    NaN      2 2010-01-01
5  5.0  55.0  666.0      2 2013-01-01
6  7.0  77.0  777.0      2 2015-01-01
7  8.0  77.0  777.0      2 2016-01-01

# 2. sync with one column (54.9 ms per loop)
print(synchronize(foo,'_c'))
    _a    _b     _c  agent       date
0  1.0  11.0  111.0      1 2010-01-01
2  1.0  22.0  333.0      1 2012-01-01
5  NaN   NaN  666.0      2 2013-01-01
6  7.0  77.0  777.0      2 2015-01-01

# 3. sync with two columns (53.4 ms per loop)
print(synchronize(foo,['_a','_b'))
    _a    _b     _c  agent       date
0  1.0  11.0  111.0      1 2010-01-01
1  1.0  22.0  111.0      1 2011-01-01
3  4.0  22.0  333.0      1 2013-01-01
4  5.0  55.0    NaN      2 2010-01-01
6  7.0  77.0  777.0      2 2015-01-01
7  8.0  77.0  777.0      2 2016-01-01

import pandas as pd
import numpy as np

def synchronize(df,sync_with=[]):
    _df = df.copy()
    
    if not isinstance(sync_with,list):
        sync_with = [sync_with]
    
    _fixed_cols = ['date','agent']
    _fixed_cols.extend(sync_with)
    _colset = [c for c in _df.columns if c not in _fixed_cols]
    
    for ag in _df.agent.unique():
        _df.loc[_df.agent==ag,_colset] = _df.loc[_df.agent==ag,_colset].fillna(method='ffill')
        if _sync_with:
            _df.loc[_df.agent==ag,:] = _df.loc[_df.agent==ag,:].dropna(how='all', subset=_sync_with)
            _df.loc[_df.agent==ag,:] = _df.loc[_df.agent==ag,:].fillna(method='ffill')

    return _df.dropna(how='all', subset=_sync_with)

# 1. default (13 ms per loop)
print(synchronize(foo))
    _a    _b     _c  agent       date
0  1.0  11.0  111.0      1 2010-01-01
1  1.0  22.0  111.0      1 2011-01-01
2  1.0  22.0  333.0      1 2012-01-01
3  4.0  22.0  333.0      1 2013-01-01
4  5.0  55.0    NaN      2 2010-01-01
5  5.0  55.0  666.0      2 2013-01-01
6  7.0  77.0  777.0      2 2015-01-01
7  8.0  77.0  777.0      2 2016-01-01

# 2. sync with one column (35 ms per loop)
print(synchronize(foo,'_c'))
    _a    _b     _c  agent       date
0  1.0  11.0  111.0      1 2010-01-01
2  1.0  22.0  333.0      1 2012-01-01
5  5.0  55.0  666.0      2 2013-01-01
6  7.0  77.0  777.0      2 2015-01-01

# 3. sync with two columns (35 ms per loop)
print(synchronize(foo,['_a','_b']))
    _a    _b     _c  agent       date
0  1.0  11.0  111.0      1 2010-01-01
1  1.0  22.0  111.0      1 2011-01-01
3  4.0  22.0  333.0      1 2013-01-01
4  5.0  55.0    NaN      2 2010-01-01
6  7.0  77.0  777.0      2 2015-01-01
7  8.0  77.0  777.0      2 2016-01-01

Removed superfluous comments

Source Link

edited Mar 20, 2018 at 13:18

301_Moved_Permanently

29.4k
3
49
98

##Question

Dear all,
II have a function that synchronizes according to some specifications columns of a dataframe.
The functions works, nevertheless I was wondering how to:

Please feel free to leave any suggestions,
Thanks.

The function, a sample and results

Function specifications

Sample

Results

##Question

Dear all,
I have a function that synchronizes according to some specifications columns of a dataframe.
The functions works, nevertheless I was wondering how to:

Please feel free to leave any suggestions,
Thanks.

The function, a sample and results

Function specifications

Sample

Results

I have a function that synchronizes according to some specifications columns of a dataframe.
The functions works, nevertheless I was wondering how to:

Please feel free to leave any suggestions.

Function specifications

Sample

Results

Source Link

asked Mar 20, 2018 at 10:44

David Leon

113
6

Pandas dataframe custom forward fillna optimisation

##Question

Dear all,
I have a function that synchronizes according to some specifications columns of a dataframe.
The functions works, nevertheless I was wondering how to:

Improve performances
Make it more pythonic

Please feel free to leave any suggestions,
Thanks.

The function, a sample and results

Function specifications

Inputs:

df : a dataframe with columns:
- [a0,...aN] : a0 to aN names can be any valid string and contains numeric values
- [agent,date] : are fixed names, agent contains numeric values and date contains datetime.
sync_with : The columns to synchronize with (a string or an listof string contained in [a0,..., aN] or, by default, an empty list to synchronize all the [a0,...,aN].

Synchronization:

Do a forward fillna grouped by agent values.
Drop the rows where all columns to synchronize with values are empty

Returns : the synchronized dataframe

Here is my function:

import pandas as pd
import numpy as np

def synchronize(df,sync_with=[]):
    _df = df.copy()
    
    if not isinstance(sync_with,list):
        sync_with = [sync_with]
    
    _fixed_cols = ['date','agent']
    _fixed_cols.extend(sync_with)
    _colset = [c for c in _df.columns if c not in _fixed_cols]
    
    for ag in _df.agent.unique():
        _df.loc[_df.agent==ag,_colset] = _df.loc[_df.agent==ag,_colset].fillna(method='ffill')
        if sync_with:
            _df = _df.dropna(how='all', subset=sync_with)
            _df.loc[_df.agent==ag,:] = _df.loc[_df.agent==ag,:].fillna(method='ffill')
        
    return _df

Sample

foo = pd.DataFrame(dict(date=pd.to_datetime(['2010', '2011', '2012', '2013', '2010', '2013', '2015', '2016']),
                        agent=[1,1,1,1,2,2,2,2],
                        _a=[1, np.nan, np.nan, 4, 5, np.nan, 7, 8],
                        _b=[11, 22, np.nan, np.nan, 55, np.nan, 77, np.nan],
                        _c=[111, np.nan, 333, np.nan, np.nan, 666, 777, np.nan]))

Results

# 1. default (10.1 ms per loop)
print(synchronize(foo))
    _a    _b     _c  agent       date
0  1.0  11.0  111.0      1 2010-01-01
1  1.0  22.0  111.0      1 2011-01-01
2  1.0  22.0  333.0      1 2012-01-01
3  4.0  22.0  333.0      1 2013-01-01
4  5.0  55.0    NaN      2 2010-01-01
5  5.0  55.0  666.0      2 2013-01-01
6  7.0  77.0  777.0      2 2015-01-01
7  8.0  77.0  777.0      2 2016-01-01

# 2. sync with one column (54.9 ms per loop)
print(synchronize(foo,'_c'))
    _a    _b     _c  agent       date
0  1.0  11.0  111.0      1 2010-01-01
2  1.0  22.0  333.0      1 2012-01-01
5  NaN   NaN  666.0      2 2013-01-01
6  7.0  77.0  777.0      2 2015-01-01

# 3. sync with two columns (53.4 ms per loop)
print(synchronize(foo,['_a','_b'))
    _a    _b     _c  agent       date
0  1.0  11.0  111.0      1 2010-01-01
1  1.0  22.0  111.0      1 2011-01-01
3  4.0  22.0  333.0      1 2013-01-01
4  5.0  55.0    NaN      2 2010-01-01
6  7.0  77.0  777.0      2 2015-01-01
7  8.0  77.0  777.0      2 2016-01-01

Stack Exchange Network

Return to Question

The function, a sample and results

Function specifications

Function specifications

Sample

Sample

Results

Results

The function, a sample and results

Function specifications

Sample

Results

Function specifications

Sample

Results

Pandas dataframe custom forward fillna optimisation

The function, a sample and results

Function specifications

Sample

Results