API Reference#

Collection of miscellaneous tools useful in a variety of situations (not specific to the current project).

misc_tools.get_end_of_current_month(d)#

Take a datetime, find the last date of the current month, and reset the time to zero.

>>> d = pd.to_datetime('2019-10-21')
>>> get_end_of_current_month(d)
Timestamp('2019-10-31 00:00:00')
>>> d = pd.to_datetime('2023-03-31 12:00:00')
>>> get_end_of_current_month(d)
Timestamp('2023-03-31 00:00:00')

From https://stackoverflow.com/a/13565185

misc_tools.get_end_of_current_quarter(d)#

Take a datetime, find the last date of the current quarter, and reset the time to zero.

>>> d = pd.to_datetime('2019-10-21')
>>> get_end_of_current_quarter(d)
datetime.datetime(2019, 12, 31, 0, 0)

# TODO: Note that the behavior below may be unwanted. Might consider
# fixing it in the future.
>>> d = pd.to_datetime('2023-03-31 12:00:00')
>>> get_end_of_current_quarter(d)
datetime.datetime(2023, 3, 31, 0, 0)

misc_tools.get_most_recent_quarter_end(d)#

Take a datetime and find the most recent quarter-end date.

>>> d = pd.to_datetime('2019-10-21')
>>> get_most_recent_quarter_end(d)
datetime.datetime(2019, 9, 30, 0, 0)

misc_tools.get_next_quarter_start(d)#

Take a datetime and find the start date of the next quarter.

>>> d = pd.to_datetime('2019-10-21')
>>> get_next_quarter_start(d)
datetime.datetime(2020, 1, 1, 0, 0)

misc_tools.groupby_weighted_average(data_col=None, weight_col=None, by_col=None, data=None, transform=False, new_column_name='')#

Faster method for calculating grouped weighted average.

From: https://stackoverflow.com/a/44683506

Examples

>>> df_nccb = pd.DataFrame({ 
...     'trade_direction': ['RECEIVED', 'RECEIVED', 'DELIVERED'], 
...     'rate': [2, 3, 2], 
...     'start_leg_amount': [100, 200, 100]}, 
... )
>>> groupby_weighted_average(data=df_nccb, data_col='rate', weight_col='start_leg_amount', by_col='trade_direction')
trade_direction
DELIVERED   2.00
RECEIVED    2.67
dtype: float64

misc_tools.groupby_weighted_quantile(data_col=None, weight_col=None, by_col=None, data=None, transform=False, new_column_name='')#

This can already be accomplished with weighted_quantile, as demonstrated in the note under weighted_quantile below; this function is provided for convenience. A sketch of the equivalent computation follows.
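
For reference, here is a minimal sketch of the equivalent groupby-apply computation with weighted_quantile, reusing the df_nccb frame from groupby_weighted_average above (the weighted median, i.e. the 0.5 quantile, is assumed as the target):

```
import pandas as pd

from misc_tools import weighted_quantile

df_nccb = pd.DataFrame({
    'trade_direction': ['RECEIVED', 'RECEIVED', 'DELIVERED'],
    'rate': [2, 3, 2],
    'start_leg_amount': [100, 200, 100]},
)

# Weighted median (0.5 quantile) of 'rate' within each group,
# weighted by 'start_leg_amount'.
weighted_median = df_nccb.groupby('trade_direction').apply(
    lambda x: weighted_quantile(x['rate'], 0.5, sample_weight=x['start_leg_amount']))
```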

misc_tools.groupby_weighted_std(data_col=None, weight_col=None, by_col=None, data=None, ddof=1)#

Method for calculating grouped weighted standard deviation.

From: https://stackoverflow.com/a/72915123

Examples

>>> df_nccb = pd.DataFrame({
...     'trade_direction': ['RECEIVED', 'RECEIVED', 'RECEIVED', 'RECEIVED', 
...         'DELIVERED', 'DELIVERED', 'DELIVERED', 'DELIVERED'],
...     'rate': [2, 2, 2, 3, 2, 2, 2, 3],
...     'start_leg_amount': [300, 300, 300, 0, 200, 200, 200, 200]},
... )
>>> groupby_weighted_std(data=df_nccb, data_col='rate', weight_col='start_leg_amount', by_col='trade_direction', ddof=1)
trade_direction
DELIVERED   0.50
RECEIVED    0.00
dtype: float64
>>> np.std([2,2,2,3], ddof=1)
0.5
>>> np.std([2,2,2], ddof=1)
0.0
>>> groupby_weighted_std(data=df_nccb, data_col='rate', weight_col='start_leg_amount', by_col='trade_direction', ddof=0)
trade_direction
DELIVERED   0.43
RECEIVED    0.00
dtype: float64
>>> np.std([2,2,2,3])
0.4330127018922193
>>> np.std([2,2,2])
0.0

misc_tools.leave_one_out_sums(df, groupby=[], summed_col='')#

Compute leave-one-out sums:

$x_i = \sum_{\ell' \neq \ell} w_{i, \ell'}$

This is helpful for constructing the shift-share instruments in Borusyak, Hull, and Jaravel (2022).

Examples

>>> df = pd.DataFrame({
...     'A' : ['foo', 'bar', 'foo', 'bar', 'foo', 'bar'],
...     'B' : ['one', 'one', 'one', 'two', 'two', 'two'],
...     'C' : [1, 5, 5, 2, 5, 3],
...     'D' : [2.0, 5., 8., 1., 2., 9.],
...     'LOO_Sum_C_groupby_B': [10, 6, 6, 8, 5, 7]
...                })
>>> LOO_Sum_C_groupby_B = df.groupby(['B'])['C'].transform(lambda x: x.sum() - x)
>>> pd.testing.assert_series_equal(
...     df['LOO_Sum_C_groupby_B'], 
...     df.groupby(['B'])['C'].transform(lambda x: x.sum() - x),
...     check_names=False)
>>> s = leave_one_out_sums(df, groupby=['B'], summed_col='C')
>>> pd.testing.assert_series_equal(
...     df['LOO_Sum_C_groupby_B'], 
...     s,
...     check_names=False)

misc_tools.merge_stats(df_left, df_right, on=[])#

Provide statistics to assess the completeness of the merge.

To assess the completeness of the merge, this function counts the number of unique elements in the index of the left and right dataframes. It produces the following:

(First, ensure that the index of left and right are sets of unique elements.)

'union': number of elements in the union of the indices
'intersection': number of elements in the intersection of the indices
'union-intersection': difference between union and intersection (symmetric unmatched)
'intersection/union': percentage matched (symmetric)
'left': number of elements in the left index
'right': number of elements in the right index
'left-intersection': number of excess elements in the left index
'right-intersection': number of excess elements in the right index
'intersection/left': percentage matched relative to the left index
'intersection/right': percentage matched relative to the right index
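
A minimal usage sketch (the two frames below are hypothetical, and the return value is assumed to be keyed by the statistic names listed above):

```
import pandas as pd

from misc_tools import merge_stats

df_left = pd.DataFrame({'x': [1, 2, 3]}, index=['a', 'b', 'c'])
df_right = pd.DataFrame({'y': [10, 20]}, index=['b', 'c'])

stats = merge_stats(df_left, df_right)
# Per the definitions above: union = 3, intersection = 2,
# left = 3, right = 2, left-intersection = 1, right-intersection = 0,
# intersection/left = 2/3, intersection/right = 1.0.
```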

misc_tools.move_column_inplace(df, col, pos=0)#

Move the column col to position pos. Use pos=0 to move it to the front; see the combined sketch below.

From https://stackoverflow.com/a/58686641

misc_tools.move_columns_to_front(df, cols=[])#

Move a list of columns cols so that they appear first, as shown in the sketch below.
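
A minimal sketch covering both helpers (move_column_inplace is assumed to mutate df in place, as its name suggests, while move_columns_to_front is assumed to return the reordered frame):

```
import pandas as pd

from misc_tools import move_column_inplace, move_columns_to_front

df = pd.DataFrame({'a': [1], 'b': [2], 'c': [3]})

move_column_inplace(df, 'c', pos=0)
# df.columns is now: ['c', 'a', 'b']

df = move_columns_to_front(df, cols=['a', 'b'])
# df.columns is now: ['a', 'b', 'c']
```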

misc_tools.plot_weighted_median_with_distribution_bars(data=None, variable_name=None, date_col='date', weight_col=None, percentile_bars=True, percentiles=[0.25, 0.75], rolling_window=1, rolling=False, rolling_min_periods=None, rescale_factor=1, ax=None, add_quarter_lines=True, ylabel=None, xlabel=None, label=None)#

Plot the weighted median of a variable over time. Optionally, plot the 25th and 75th percentiles.

Notes

rolling_window=1 means that no rolling aggregation is applied.

Example usage:

```
fig, ax = plt.subplots()
ax = plot_weighted_median_with_distribution_bars(
    data=df,
    variable_name='rate_SD_spread',
    date_col='date',
    weight_col='Volume',
    percentile_bars=True,
    percentiles=[0.25, 0.75],
    rolling_window=5,
    rescale_factor=100,
    ax=ax,
    add_quarter_lines=True,
    ylabel=None,
    xlabel=None,
    label=None,
)
plt.title('Volume-weighted median rate spread (bps)\nShaded region shows 25/75 percentiles')
other_bbg['2019-10-21':].plot(ax=ax)
plt.legend()
```

misc_tools.weighted_average(data_col=None, weight_col=None, data=None)#

Simple calculation of weighted average.

Examples

>>> df_nccb = pd.DataFrame({
...     'rate': [2, 3, 2],
...     'start_leg_amount': [100, 200, 100]},
... )
>>> weighted_average(data_col='rate', weight_col='start_leg_amount', data=df_nccb)
2.5

misc_tools.weighted_quantile(values, quantiles, sample_weight=None, values_sorted=False, old_style=False)#

Very close to numpy.percentile, but supports weights.

Parameters:

values : numpy.array
    Data to compute quantiles over.
quantiles : array-like
    The quantiles to compute.
sample_weight : array-like
    Weights of the same length as values.
values_sorted : bool, default False
    If True, avoids sorting the initial array.
old_style : bool, default False
    If True, corrects output to be consistent with numpy.percentile.

Returns:

numpy.array
    The computed quantiles.

Notes

quantiles should be in [0, 1]!

FROM: https://stackoverflow.com/a/29677616

NOTE: A groupby weighted quantile can be computed like this:

```
median_SD_spread = data.groupby('date').apply(
    lambda x: weighted_quantile(x['rate_SD_spread'], 0.5, sample_weight=x['Volume']))
```
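
For a direct call, a small sketch (the behavior is assumed to follow the linked Stack Overflow implementation):

```
import numpy as np

from misc_tools import weighted_quantile

values = np.array([1, 2, 3, 4])

# With equal weights, the 0.5 quantile should match the ordinary median (2.5).
weighted_quantile(values, [0.5], sample_weight=np.array([1, 1, 1, 1]))

# Upweighting the largest observation pulls the weighted median upward.
weighted_quantile(values, [0.5], sample_weight=np.array([1, 1, 1, 10]))
```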

misc_tools.with_lagged_columns(data=None, columns_to_lag=None, id_columns=None, lags=1, date_col='date', prefix='L')#

Add lagged columns to a dataframe.

Examples

>>> a=[[1,'1990/1/1',1],
... [1,'1990/2/1',2],
... [1,'1990/3/1',3],
... [2,'1989/12/1',3],
... [2,'1990/1/1',3],
... [2,'1990/2/1',4],
... [2,'1990/3/1',5.5],
... [2,'1990/4/1',5],
... [2,'1990/6/1',6]]
>>> data=pd.DataFrame(a,columns=['id','date','value'])
>>> data['date']=pd.to_datetime(data['date'])
>>> data
   id       date  value
0   1 1990-01-01   1.00
1   1 1990-02-01   2.00
2   1 1990-03-01   3.00
3   2 1989-12-01   3.00
4   2 1990-01-01   3.00
5   2 1990-02-01   4.00
6   2 1990-03-01   5.50
7   2 1990-04-01   5.00
8   2 1990-06-01   6.00
>>> data_lag = with_lagged_columns(data=data, columns_to_lag=['value'], id_columns=['id'], lags=1)
>>> data_lag
   id       date  value  L1_value
0   1 1990-01-01   1.00       NaN
1   1 1990-02-01   2.00      1.00
2   1 1990-03-01   3.00      2.00
3   2 1989-12-01   3.00       NaN
4   2 1990-01-01   3.00      3.00
5   2 1990-02-01   4.00      3.00
6   2 1990-03-01   5.50      4.00
7   2 1990-04-01   5.00      5.50
8   2 1990-06-01   6.00      5.00