API Reference#
Collection of miscellaneous tools useful in a variety of situations (not specific to the current project).
- misc_tools.get_end_of_current_month(d)#
Take a datetime, find the last date of the current month, and reset the time to zero.
>>> d = pd.to_datetime('2019-10-21')
>>> get_end_of_current_month(d)
Timestamp('2019-10-31 00:00:00')
>>> d = pd.to_datetime('2023-03-31 12:00:00')
>>> get_end_of_current_month(d)
Timestamp('2023-03-31 00:00:00')
- misc_tools.get_end_of_current_quarter(d)#
Take a datetime, find the last date of the current quarter, and reset the time to zero.
>>> d = pd.to_datetime('2019-10-21')
>>> get_end_of_current_quarter(d)
datetime.datetime(2019, 12, 31, 0, 0)
# TODO: Note that the behavior below may be unwanted. Might consider
# fixing in the future.
>>> d = pd.to_datetime('2023-03-31 12:00:00')
>>> get_end_of_current_quarter(d)
datetime.datetime(2023, 3, 31, 0, 0)
- misc_tools.get_most_recent_quarter_end(d)#
Take a datetime and find the most recent quarter end date
>>> d = pd.to_datetime('2019-10-21')
>>> get_most_recent_quarter_end(d)
datetime.datetime(2019, 9, 30, 0, 0)
- misc_tools.get_next_quarter_start(d)#
Take a datetime and find the start date of the next quarter
>>> d = pd.to_datetime('2019-10-21')
>>> get_next_quarter_start(d)
datetime.datetime(2020, 1, 1, 0, 0)
- misc_tools.groupby_weighted_average(data_col=None, weight_col=None, by_col=None, data=None, transform=False, new_column_name='')#
Faster method for calculating grouped weighted average.
From: https://stackoverflow.com/a/44683506
Examples
>>> df_nccb = pd.DataFrame({
...     'trade_direction': ['RECEIVED', 'RECEIVED', 'DELIVERED'],
...     'rate': [2, 3, 2],
...     'start_leg_amount': [100, 200, 100]},
... )
>>> groupby_weighted_average(data=df_nccb, data_col='rate', weight_col='start_leg_amount', by_col='trade_direction')
trade_direction
DELIVERED   2.00
RECEIVED    2.67
dtype: float64
- misc_tools.groupby_weighted_quantile(data_col=None, weight_col=None, by_col=None, data=None, transform=False, new_column_name='')#
This can already be accomplished with weighted_quantile, as demonstrated in the notes to that function. This wrapper is provided for convenience.
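For orientation, here is a minimal sketch of the underlying pattern using weighted_quantile directly. The dataframe and column names are hypothetical, and the exact signature and output format of groupby_weighted_quantile itself may differ.

```
import pandas as pd
from misc_tools import weighted_quantile

# Hypothetical data: per-date rates weighted by traded volume.
df = pd.DataFrame({
    'date': pd.to_datetime(['2019-10-21', '2019-10-21', '2019-10-22', '2019-10-22']),
    'rate': [2.0, 3.0, 2.5, 4.0],
    'volume': [100, 300, 200, 200],
})

# Weighted median per date, following the pattern shown in the
# weighted_quantile notes below.
median_by_date = df.groupby('date').apply(
    lambda x: weighted_quantile(x['rate'], 0.5, sample_weight=x['volume'])
)
```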
- misc_tools.groupby_weighted_std(data_col=None, weight_col=None, by_col=None, data=None, ddof=1)#
Method for calculating a grouped weighted standard deviation.
From: https://stackoverflow.com/a/72915123
Examples
>>> df_nccb = pd.DataFrame({
...     'trade_direction': ['RECEIVED', 'RECEIVED', 'RECEIVED', 'RECEIVED',
...                         'DELIVERED', 'DELIVERED', 'DELIVERED', 'DELIVERED'],
...     'rate': [2, 2, 2, 3, 2, 2, 2, 3],
...     'start_leg_amount': [300, 300, 300, 0, 200, 200, 200, 200]},
... )
>>> groupby_weighted_std(data=df_nccb, data_col='rate', weight_col='start_leg_amount', by_col='trade_direction', ddof=1)
trade_direction
DELIVERED   0.50
RECEIVED    0.00
dtype: float64
>>> np.std([2, 2, 2, 3], ddof=1)
0.5
>>> np.std([2, 2, 2], ddof=1)
0.0
>>> groupby_weighted_std(data=df_nccb, data_col='rate', weight_col='start_leg_amount', by_col='trade_direction', ddof=0)
trade_direction
DELIVERED   0.43
RECEIVED    0.00
dtype: float64
>>> np.std([2, 2, 2, 3])
0.4330127018922193
>>> np.std([2, 2, 2])
0.0
- misc_tools.leave_one_out_sums(df, groupby=[], summed_col='')#
Compute leave-one-out sums,
$x_i = \sum_{\ell' \neq \ell} w_{i, \ell'}$
This is helpful for constructing the shift-share instruments in Borusyak, Hull, Jaravel (2022).
Examples
>>> df = pd.DataFrame({
...     'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar'],
...     'B': ['one', 'one', 'one', 'two', 'two', 'two'],
...     'C': [1, 5, 5, 2, 5, 3],
...     'D': [2.0, 5., 8., 1., 2., 9.],
...     'LOO_Sum_C_groupby_B': [10, 6, 6, 8, 5, 7]
... })
>>> LOO_Sum_C_groupby_B = df.groupby(['B'])['C'].transform(lambda x: x.sum() - x)
>>> pd.testing.assert_series_equal(
...     df['LOO_Sum_C_groupby_B'],
...     df.groupby(['B'])['C'].transform(lambda x: x.sum() - x),
...     check_names=False)
>>> s = leave_one_out_sums(df, groupby=['B'], summed_col='C')
>>> pd.testing.assert_series_equal(
...     df['LOO_Sum_C_groupby_B'],
...     s,
...     check_names=False)
- misc_tools.merge_stats(df_left, df_right, on=[])#
Provide statistics to assess the completeness of the merge.
To assess the completeness of the merge, this function counts the number of unique elements in the index of the left and right dataframes. It produces the following:
(First, ensure that the indices of the left and right dataframes contain only unique elements.)
- 'union': number of elements in the union of the indices
- 'intersection': number of elements in the intersection of the indices
- 'union-intersection': difference between union and intersection (symmetric count of unmatched elements)
- 'intersection/union': percentage matched, symmetric
- 'left': number of elements in the left index
- 'right': number of elements in the right index
- 'left-intersection': number of excess elements in the left index
- 'right-intersection': number of excess elements in the right index
- 'intersection/left': percentage matched relative to the total in the left index
- 'intersection/right': percentage matched relative to the total in the right index
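A minimal usage sketch, assuming the comparison is performed on the dataframe indices when on is left at its default; the example dataframes and the expected counts noted in the comments are illustrative.

```
import pandas as pd
from misc_tools import merge_stats

# Hypothetical dataframes indexed by ticker; two of the four tickers overlap.
df_left = pd.DataFrame({'x': [1, 2, 3]}, index=['AAPL', 'MSFT', 'GOOG'])
df_right = pd.DataFrame({'y': [4, 5, 6]}, index=['MSFT', 'GOOG', 'AMZN'])

# If the indices are compared, we would expect union=4, intersection=2,
# left=3, right=3, and the corresponding ratios listed above.
stats = merge_stats(df_left, df_right)
print(stats)
```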
- misc_tools.move_column_inplace(df, col, pos=0)#
https://stackoverflow.com/a/58686641
Use pos=0 to move the column to the front.
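A short sketch of the intended call pattern; the dataframe is hypothetical, and the in-place behavior is inferred from the function name.

```
import pandas as pd
from misc_tools import move_column_inplace

df = pd.DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [5, 6]})

# Move column 'c' to the front (position 0); the name suggests the
# dataframe is modified in place rather than returned.
move_column_inplace(df, 'c', pos=0)
# Expected column order afterwards: ['c', 'a', 'b']
```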
- misc_tools.move_columns_to_front(df, cols=[])#
Move the columns listed in cols so that they appear first in the dataframe.
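A hedged sketch of the call pattern; whether the reordered dataframe is returned or modified in place is an assumption here.

```
import pandas as pd
from misc_tools import move_columns_to_front

df = pd.DataFrame({'a': [1], 'b': [2], 'c': [3], 'd': [4]})

# Put 'c' and 'd' first; presumably 'a' and 'b' keep their relative
# order after them. The assignment assumes the function returns the
# reordered dataframe.
df = move_columns_to_front(df, cols=['c', 'd'])
```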
- misc_tools.plot_weighted_median_with_distribution_bars(data=None, variable_name=None, date_col='date', weight_col=None, percentile_bars=True, percentiles=[0.25, 0.75], rolling_window=1, rolling=False, rolling_min_periods=None, rescale_factor=1, ax=None, add_quarter_lines=True, ylabel=None, xlabel=None, label=None)#
Plot the weighted median of a variable over time. Optionally, plot the 25th and 75th percentiles
Notes
rolling_window=1 means that there is no rolling aggregation applied.
Examples
fig, ax = plt.subplots()
ax = plot_weighted_median_with_distribution_bars(
    data=df,
    variable_name='rate_SD_spread',
    date_col='date',
    weight_col='Volume',
    percentile_bars=True,
    percentiles=[0.25, 0.75],
    rolling_window=5,
    rescale_factor=100,
    ax=ax,
    add_quarter_lines=True,
    ylabel=None,
    xlabel=None,
    label=None,
)
other_bbg['2019-10-21':].plot(ax=ax)
plt.legend()
plt.title('Volume-weighted median rate spread (bps)'
          ' - Shaded region shows 25/75 percentiles')
- misc_tools.weighted_average(data_col=None, weight_col=None, data=None)#
Simple calculation of weighted average.
Examples
>>> df_nccb = pd.DataFrame({ ... 'rate': [2, 3, 2], ... 'start_leg_amount': [100, 200, 100]}, ... ) >>> weighted_average(data_col='rate', weight_col='start_leg_amount', data=df_nccb) 2.5
- misc_tools.weighted_quantile(values, quantiles, sample_weight=None, values_sorted=False, old_style=False)#
Very close to numpy.percentile, but supports weights.
- Parameters:
- values:
numpy.array with data
- quantiles:
array-like with many quantiles needed
- sample_weight:
array-like of the same length as values
- values_sorted: bool, default False
if True, then will avoid sorting of initial array
- old_style:
if True, will correct output to be consistent with numpy.percentile.
- Returns:
- numpy.array
with computed quantiles.
Notes
quantiles should be in [0, 1]!
FROM: https://stackoverflow.com/a/29677616
NOTE: a groupby weighted quantile can look like this:
```
median_SD_spread = data.groupby('date').apply(
    lambda x: weighted_quantile(x['rate_SD_spread'], 0.5, sample_weight=x['Volume']))
```
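A minimal call sketch with illustrative data; with uniform weights the result is broadly comparable to numpy.percentile, and old_style=True can be used for consistency with it (see the parameter description above).

```
import numpy as np
from misc_tools import weighted_quantile

values = np.array([1.0, 2.0, 9.0, 3.2, 4.0])
quantiles = [0.25, 0.5, 0.75]

# Uniform weights: roughly comparable to np.percentile(values, [25, 50, 75]).
weighted_quantile(values, quantiles, sample_weight=np.ones(len(values)))

# Upweighting the largest observation pulls the upper quantiles toward 9.
weighted_quantile(values, quantiles, sample_weight=[1, 1, 5, 1, 1])
```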
- misc_tools.with_lagged_columns(data=None, columns_to_lag=None, id_columns=None, lags=1, date_col='date', prefix='L')#
Add lagged columns to a dataframe.
Examples
>>> a = [[1, '1990/1/1', 1],
...      [1, '1990/2/1', 2],
...      [1, '1990/3/1', 3],
...      [2, '1989/12/1', 3],
...      [2, '1990/1/1', 3],
...      [2, '1990/2/1', 4],
...      [2, '1990/3/1', 5.5],
...      [2, '1990/4/1', 5],
...      [2, '1990/6/1', 6]]
>>> data = pd.DataFrame(a, columns=['id', 'date', 'value'])
>>> data['date'] = pd.to_datetime(data['date'])
>>> data
   id       date  value
0   1 1990-01-01   1.00
1   1 1990-02-01   2.00
2   1 1990-03-01   3.00
3   2 1989-12-01   3.00
4   2 1990-01-01   3.00
5   2 1990-02-01   4.00
6   2 1990-03-01   5.50
7   2 1990-04-01   5.00
8   2 1990-06-01   6.00
>>> data_lag = with_lagged_columns(data=data, columns_to_lag=['value'], id_columns=['id'], lags=1)
>>> data_lag
   id       date  value  L1_value
0   1 1990-01-01   1.00       NaN
1   1 1990-02-01   2.00      1.00
2   1 1990-03-01   3.00      2.00
3   2 1989-12-01   3.00       NaN
4   2 1990-01-01   3.00      3.00
5   2 1990-02-01   4.00      3.00
6   2 1990-03-01   5.50      4.00
7   2 1990-04-01   5.00      5.50
8   2 1990-06-01   6.00      5.00