import extract_hkm_cmdty
import load_futures_data
import calc_commodities_returns
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pull_bbg_commodities_basis
from replicate_cmdty import generate_corr_matrix, decide_optimal_pairs

import sys

sys.path.append("..")

from settings import config

BASE_DIR = config("BASE_DIR")
DATA_DIR = config("DATA_DIR")

Cleaning Summary: Commodities Returns#

This notebook outlines the steps taken to generate an approximate replication of the commodity return series presented in He, Kelly, and Manela (2017).

Overview#

He, Kelly, and Manela constructed monthly returns for 23 commodities selected from a larger set of 31 commodities originally studied by Yang (2013). These commodities were chosen based on data availability from the Commodity Research Bureau (CRB), requiring at least 25 years of monthly returns data. He, Kelly, and Manela employed an equal-weighted portfolio of all futures contracts with maturities of four months or less, slightly modifying Yang’s original method, which used contracts with maturities between one month and twelve months.

Note: the Yang paper ranges from 197101-200812 and hkm ranges from 198609-201212

However, when attempting direct replication, we encountered significant data limitations. The CRB dataset, as originally utilized by Yang and subsequently He, Kelly, and Manela, is not readily accessible. Matching the precise contract numbers listed in Yang’s Table 1 (“N” values) was also impractical given current dataset constraints.

While it may not be perfectly accurate—since neither the Yang nor the HKM papers specify the yearspan of the commodities they used—we can still make a reasonable inference. Both studies rely on the same underlying dataset. By sorting Yang’s table of commodities by the number of monthly observations (N), we can roughly estimate which commodities were likely selected by HKM, given their stated requirement of at least 25 years of monthly return data.

data = [
    ["Agriculture", "Barley", "WA", 235, -3.66, 27.66, -0.24, 19.62, -1.21],
    ["Agriculture", "Butter", "O2", 141, -3.68, 33.33, 3.66, 27.22, 13.46],
    ["Agriculture", "Canola", "WC", 377, -2.98, 33.16, -0.18, 19.82, -0.89],
    ["Agriculture", "Cocoa", "CC", 452, -2.61, 25.22, 4.52, 30.32, 14.90],
    ["Agriculture", "Coffee", "KC", 420, -2.57, 36.90, 6.00, 36.52, 16.44],
    ["Agriculture", "Corn", "C-", 468, -6.03, 23.08, -0.01, 23.35, -0.04],
    ["Agriculture", "Cotton", "CT", 452, -1.75, 36.50, 3.60, 22.96, 15.69],
    ["Agriculture", "Lumber", "LB", 468, -5.63, 33.55, -1.13, 22.80, -4.98],
    ["Agriculture", "Oats", "O-", 468, -5.65, 31.20, 0.44, 28.90, 1.53],
    ["Agriculture", "Orange juice", "JO", 448, -3.08, 36.61, 2.32, 29.56, 7.86],
    ["Agriculture", "Rough rice", "RR", 265, -7.56, 26.04, -1.50, 25.01, -6.01],
    ["Agriculture", "Soybean meal", "SM", 468, 0.20, 44.87, 7.80, 28.63, 27.25],
    ["Agriculture", "Soybeans", "S-", 468, -0.58, 37.18, 5.99, 26.25, 22.81],
    ["Agriculture", "Wheat", "W-", 468, -2.88, 38.68, 2.79, 23.76, 11.72],
    ["Energy", "Crude oil", "CL", 295, 4.25, 66.78, 10.56, 27.87, 37.89],
    ["Energy", "Gasoline", "RB", 275, 8.09, 70.91, 12.82, 30.18, 42.47],
    ["Energy", "Heating oil", "HO", 345, 1.49, 55.65, 9.50, 28.65, 33.15],
    ["Energy", "Natural gas", "NG", 216, -3.63, 43.06, 8.66, 34.63, 25.00],
    ["Energy", "Propane", "PN", 247, 5.53, 55.47, 14.28, 34.18, 41.77],
    ["Energy", "Unleaded gas", "HU", 250, 8.62, 71.20, 16.02, 29.24, 54.78],
    ["Livestock", "Broilers", "BR", 19, 4.58, 52.63, 1.49, 7.28, 20.53],
    ["Livestock", "Feeder cattle", "FC", 443, 0.35, 53.27, 4.43, 14.28, 31.01],
    ["Livestock", "Lean hogs", "LH", 468, 2.66, 59.40, 7.98, 22.34, 35.70],
    ["Livestock", "Live cattle", "LC", 468, 0.46, 50.64, 4.55, 14.92, 30.46],
    ["Metals", "Aluminum", "AL", 215, 1.06, 35.35, 5.46, 19.11, 28.56],
    ["Metals", "Coal", "QL", 85, -1.55, 34.12, 6.20, 30.02, 20.65],
    ["Metals", "Copper", "HG", 412, 0.52, 41.75, 4.62, 25.50, 18.12],
    ["Metals", "Gold", "GC", 400, -6.24, 0.00, 0.43, 19.88, 2.18],
    ["Metals", "Palladium", "PA", 362, -2.16, 30.66, 10.21, 35.19, 29.01],
    ["Metals", "Platinum", "PL", 410, -3.21, 23.66, 3.69, 27.81, 13.27],
    ["Metals", "Silver", "SI", 419, -6.51, 1.19, 0.44, 32.09, 1.37],
]

columns = [
    "Sector",
    "Commodity",
    "Symbol",
    "N",
    "Basis",
    "Freq. of bw.",
    "E[Rᵉ]",
    "σ[Rᵉ]",
    "Sharpe ratio",
]

df_yang = pd.DataFrame(data, columns=columns)

print(df_yang.sort_values("N", ascending=False).reset_index(drop=True))

         Sector      Commodity Symbol    N  Basis  Freq. of bw.  E[Rᵉ]  σ[Rᵉ]  \
   Livestock      Lean hogs     LH  468   2.66         59.40   7.98  22.34   
 Agriculture          Wheat     W-  468  -2.88         38.68   2.79  23.76   
   Livestock    Live cattle     LC  468   0.46         50.64   4.55  14.92   
 Agriculture           Corn     C-  468  -6.03         23.08  -0.01  23.35   
 Agriculture         Lumber     LB  468  -5.63         33.55  -1.13  22.80   
 Agriculture           Oats     O-  468  -5.65         31.20   0.44  28.90   
 Agriculture   Soybean meal     SM  468   0.20         44.87   7.80  28.63   
 Agriculture       Soybeans     S-  468  -0.58         37.18   5.99  26.25   
 Agriculture          Cocoa     CC  452  -2.61         25.22   4.52  30.32   
 Agriculture         Cotton     CT  452  -1.75         36.50   3.60  22.96   
Agriculture   Orange juice     JO  448  -3.08         36.61   2.32  29.56   
  Livestock  Feeder cattle     FC  443   0.35         53.27   4.43  14.28   
Agriculture         Coffee     KC  420  -2.57         36.90   6.00  36.52   
     Metals         Silver     SI  419  -6.51          1.19   0.44  32.09   
     Metals         Copper     HG  412   0.52         41.75   4.62  25.50   
     Metals       Platinum     PL  410  -3.21         23.66   3.69  27.81   
     Metals           Gold     GC  400  -6.24          0.00   0.43  19.88   
Agriculture         Canola     WC  377  -2.98         33.16  -0.18  19.82   
     Metals      Palladium     PA  362  -2.16         30.66  10.21  35.19   
     Energy    Heating oil     HO  345   1.49         55.65   9.50  28.65   
     Energy      Crude oil     CL  295   4.25         66.78  10.56  27.87   
     Energy       Gasoline     RB  275   8.09         70.91  12.82  30.18   
Agriculture     Rough rice     RR  265  -7.56         26.04  -1.50  25.01   
     Energy   Unleaded gas     HU  250   8.62         71.20  16.02  29.24   
     Energy        Propane     PN  247   5.53         55.47  14.28  34.18   
Agriculture         Barley     WA  235  -3.66         27.66  -0.24  19.62   
     Energy    Natural gas     NG  216  -3.63         43.06   8.66  34.63   
     Metals       Aluminum     AL  215   1.06         35.35   5.46  19.11   
Agriculture         Butter     O2  141  -3.68         33.33   3.66  27.22   
     Metals           Coal     QL   85  -1.55         34.12   6.20  30.02   
  Livestock       Broilers     BR   19   4.58         52.63   1.49   7.28   

    Sharpe ratio  
        35.70  
        11.72  
        30.46  
        -0.04  
        -4.98  
         1.53  
        27.25  
        22.81  
        14.90  
        15.69  
        7.86  
       31.01  
       16.44  
        1.37  
       18.12  
       13.27  
        2.18  
       -0.89  
       29.01  
       33.15  
       37.89  
       42.47  
       -6.01  
       54.78  
       41.77  
       -1.21  
       25.00  
       28.56  
       13.46  
       20.65  
       20.53  

Approach1 – GSCI Indices#

Fortunately, He, Kelly, and Manela mentioned an alternative method tested by Koijen, Moskowitz, Pedersen, and Vrugt (2018) (KMPV), which closely approximated their original findings. The KMPV paper explicitly provided Bloomberg tickers of 24 different commodities, enabling direct extraction of reliable and consistent monthly return data.

Thus, for our replication, we adopted the Bloomberg database following KMPV’s approach. Since KMPV did not explicitly present a calculation formula but rather directly extracted monthly returns from Bloomberg, our replication followed the same straightforward procedure.

df_return1 = load_futures_data.load_gsci_data(data_dir=DATA_DIR / "commodities")
df_return1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 669 entries, 196912 to 202508
Data columns (total 25 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 Date                          669 non-null    datetime64[ns]
 SPGCBRP Index_PX_LAST_Return  319 non-null    float64       
 SPGCGOP Index_PX_LAST_Return  319 non-null    float64       
 SPGCCLP Index_PX_LAST_Return  463 non-null    float64       
 SPGCHUP Index_PX_LAST_Return  451 non-null    float64       
 SPGCHOP Index_PX_LAST_Return  511 non-null    float64       
 SPGCNGP Index_PX_LAST_Return  379 non-null    float64       
 SPGCCTP Index_PX_LAST_Return  583 non-null    float64       
 SPGCKCP Index_PX_LAST_Return  535 non-null    float64       
 SPGCCCP Index_PX_LAST_Return  499 non-null    float64       
SPGCSBP Index_PX_LAST_Return  631 non-null    float64       
SPGCSOP Index_PX_LAST_Return  668 non-null    float64       
SPGCKWP Index_PX_LAST_Return  319 non-null    float64       
SPGCCNP Index_PX_LAST_Return  668 non-null    float64       
SPGCWHP Index_PX_LAST_Return  668 non-null    float64       
SPGCLHP Index_PX_LAST_Return  595 non-null    float64       
SPGCFCP Index_PX_LAST_Return  283 non-null    float64       
SPGCLCP Index_PX_LAST_Return  668 non-null    float64       
SPGCGCP Index_PX_LAST_Return  571 non-null    float64       
SPGCSIP Index_PX_LAST_Return  631 non-null    float64       
SPGCIAP Index_PX_LAST_Return  415 non-null    float64       
SPGCIKP Index_PX_LAST_Return  391 non-null    float64       
SPGCILP Index_PX_LAST_Return  367 non-null    float64       
SPGCIZP Index_PX_LAST_Return  415 non-null    float64       
SPGCICP Index_PX_LAST_Return  583 non-null    float64       
dtypes: datetime64[ns](1), float64(24)
memory usage: 135.9+ KB

df_return1.tail(5)

	Date	SPGCBRP Index_PX_LAST_Return	SPGCGOP Index_PX_LAST_Return	SPGCCLP Index_PX_LAST_Return	SPGCHUP Index_PX_LAST_Return	SPGCHOP Index_PX_LAST_Return	SPGCNGP Index_PX_LAST_Return	SPGCCTP Index_PX_LAST_Return	SPGCKCP Index_PX_LAST_Return	SPGCCCP Index_PX_LAST_Return	...	SPGCLHP Index_PX_LAST_Return	SPGCFCP Index_PX_LAST_Return	SPGCLCP Index_PX_LAST_Return	SPGCGCP Index_PX_LAST_Return	SPGCSIP Index_PX_LAST_Return	SPGCIAP Index_PX_LAST_Return	SPGCIKP Index_PX_LAST_Return	SPGCILP Index_PX_LAST_Return	SPGCIZP Index_PX_LAST_Return	SPGCICP Index_PX_LAST_Return
yyyymm
202504	2025-04-30	-0.176246	-0.114556	-0.179416	-0.115157	-0.103167	-0.215774	-0.023963	0.061948	0.133033	...	0.031488	0.016751	0.023324	0.053582	-0.060372	-0.056096	-0.034990	-0.024421	-0.096317	-0.057642
202505	2025-05-30	0.034989	-0.006110	0.051259	0.013584	0.015693	-0.048471	-0.014541	-0.145477	0.101722	...	0.034167	0.012880	0.026952	-0.009677	0.006123	0.015721	-0.016660	-0.003130	0.009227	0.044607
202506	2025-06-30	0.075806	0.126826	0.089940	0.036585	0.141990	-0.023071	0.010908	-0.118045	-0.017596	...	0.006904	0.039655	0.021615	-0.002324	0.085410	0.064561	-0.005331	0.039348	0.047681	0.040931
202507	2025-07-31	0.092004	0.082365	0.084626	0.069299	0.069372	-0.110300	-0.012916	-0.014328	-0.054888	...	-0.036915	0.067376	0.055640	-0.004224	0.014930	-0.014084	-0.023706	-0.046216	0.004186	-0.032005
202508	2025-08-12	-0.077099	-0.056212	-0.086503	-0.046403	-0.061954	-0.093846	0.016952	0.065448	0.060844	...	0.022607	0.044167	0.026328	0.015051	0.035123	0.020532	0.024847	0.023599	0.032195	0.021583

5 rows × 25 columns

hkm_df = extract_hkm_cmdty.extract_hkm_cmdty(
    data_dir=BASE_DIR / "_data" / "he_kelly_manela"
)

hkm_df.tail(5)

	Commod_01	Commod_02	Commod_03	Commod_04	Commod_05	Commod_06	Commod_07	Commod_08	Commod_09	Commod_10	...	Commod_14	Commod_15	Commod_16	Commod_17	Commod_18	Commod_19	Commod_20	Commod_21	Commod_22	Commod_23
yyyymm
201208	-0.0057	0.1002	0.0883	0.0801	0.0291	0.0444	0.0095	0.1050	0.1311	-0.0655	...	0.0297	0.0611	0.0817	0.1266	-0.0392	0.0656	0.1165	0.0673	-0.0175	0.0100
201209	-0.0558	-0.0366	-0.0479	-0.0950	-0.0196	0.0499	0.0847	-0.0085	-0.0490	0.0518	...	-0.0702	0.0181	0.0800	0.0228	0.0123	-0.0905	0.0951	-0.0907	0.0146	-0.0639
201210	-0.0006	-0.0521	-0.0616	-0.0081	0.0113	-0.0313	-0.0653	-0.0255	-0.0627	-0.1149	...	0.0488	-0.0498	-0.0558	-0.0519	-0.0422	-0.0342	-0.0675	-0.0138	-0.0429	0.0288
201211	-0.0079	0.0504	0.0203	0.0355	-0.0179	-0.0048	0.0329	0.0058	0.1388	-0.0705	...	-0.0611	0.1182	0.0168	0.0480	0.0089	-0.0676	0.0272	-0.0855	-0.0205	-0.0328
201212	-0.0751	-0.1107	0.0197	0.0166	0.0403	-0.0216	0.0007	-0.0109	-0.0572	-0.0461	...	-0.0860	0.0223	-0.0417	0.0125	-0.0268	-0.0150	-0.0963	-0.0262	-0.1042	0.0028

5 rows × 23 columns

Matching Procedure#

Given that He, Kelly, and Manela presented anonymized commodities without explicit tickers or identifiers, we needed a systematic approach to match our Bloomberg-based commodities with the paper’s 23 commodities. We computed the Pearson correlation matrix between our monthly return series and those presented in the paper. Then, we used the linear assignment algorithm to find the optimal one-to-one commodity matches, maximizing the total correlation between the two datasets.

corr_matrix1 = generate_corr_matrix(df_return1.drop(columns=["Date"]), hkm_df)

plt.figure(figsize=(10, 8))
corr_matrix_float1 = corr_matrix1.astype(float)
sns.heatmap(
    corr_matrix_float1,
    annot=False,
    fmt=".2f",
    cmap="coolwarm",
    vmin=-1,
    vmax=1,
    square=True,
    linewidths=0.5,
    cbar=True,
)

plt.tight_layout()
plt.title("Commodity Return Correlation Matrix")
plt.show()

../_images/d8cecfff6e138671fc42b4b18e8174904b9c324fdd97af823b291df8f1710594.png

optimal_pairs_df1, row_ind, col_ind = decide_optimal_pairs(corr_matrix_float1)

ticker_to_commodity = {
    "SPGCBRP Index": "Crude Oil",
    "SPGCGOP Index": "Gasoil",
    "SPGCCLP Index": "WTI Crude",
    "SPGCHUP Index": "Unl. Gasoline",
    "SPGCHOP Index": "Heating Oil",
    "SPGCNGP Index": "Natural Gas",
    "SPGCCTP Index": "Cotton",
    "SPGCKCP Index": "Coffee",
    "SPGCCCP Index": "Cocoa",
    "SPGCSBP Index": "Sugar",
    "SPGCSOP Index": "Soybeans",
    "SPGCKWP Index": "Kansas Wheat",
    "SPGCCNP Index": "Corn",
    "SPGCWHP Index": "Wheat",
    "SPGCLHP Index": "Lean Hogs",
    "SPGCFCP Index": "Feeder Cattle",
    "SPGCLCP Index": "Live Cattle",
    "SPGCGCP Index": "Gold",
    "SPGCSIP Index": "Silver",
    "SPGCIAP Index": "Aluminum",
    "SPGCIKP Index": "Nickel",
    "SPGCILP Index": "Lead",
    "SPGCIZP Index": "Zinc",
    "SPGCICP Index": "Copper",
}

optimal_pairs_df1["GSCI Index"] = optimal_pairs_df1["Commodity_1"].str.replace(
    "_PX_LAST_Return", "", regex=False
)

# Map to real commodity names
optimal_pairs_df1["Commodity Name"] = optimal_pairs_df1["GSCI Index"].map(
    ticker_to_commodity
)
optimal_pairs_df1 = optimal_pairs_df1.rename(columns={"Commodity_2": "HKM Column Name"})

df1 = df_return1.copy()
df2 = hkm_df.copy()

for r, c in zip(row_ind, col_ind):
    corr_value = corr_matrix1.iloc[r, c]
    if corr_value < 0.85:
        continue  # Skip low correlation pairs

    commodity_1 = corr_matrix1.index[r]
    commodity_2 = corr_matrix1.columns[c]

    merged = (
        pd.DataFrame(
            {
                "Date": df1["Date"],
                f"d1_{commodity_1}": df1[commodity_1],
                f"d2_{commodity_2}": df2[commodity_2],
            }
        )
        .dropna()
        .set_index("Date")
    )

    plt.figure(figsize=(8, 4))
    plt.plot(
        merged.index, merged[f"d1_{commodity_1}"], label=f"{commodity_1}", linewidth=2
    )
    plt.plot(
        merged.index,
        merged[f"d2_{commodity_2}"],
        label=f"{commodity_2}",
        linewidth=2,
        linestyle="--",
    )

    plt.title(
        f"Matched Pair: {commodity_1} & {commodity_2}\nCorrelation = {corr_value:.3f}"
    )
    plt.xlabel("Date")
    plt.ylabel("Monthly Return")
    plt.legend()
    plt.grid(True)

    plt.xticks()
    plt.tight_layout()
    plt.show()

../_images/28ecf909de94fe5423a1c43142e1671cdf6ded90e3dd0ca316a0d06e094c312f.png

../_images/09b01ef53f50a7a23af51b0936bd85e298408404396b9e5958326584a67d9ade.png

../_images/5c13e5b759410cede3ce769aa915fcd3ebe32a7216a679cac575ea07d4a76f6c.png

../_images/af7de04b6832f209934c6bce7e8c1b6c4167026d84d67a89d960ec7d46035066.png

../_images/567d46bed1d94fb412a88f1f14277eb274737ce0cad77e6f1377bf65cf36ebb5.png

../_images/ce5b2934c53d0d959b593f6a4d3d695d2d155ff74191e6551cc0706a24cccbc4.png

../_images/f45645de31b5cd22b9e9fc7098a1cc83cb182c4c98baddbb037c91dd207122e5.png

../_images/4d6f09947effabda2e49680ff6f28b384ad704cc81ac47613913c57b53fda8e6.png

../_images/7848c43960111e7c2fbf977bffcae703382a982e085dc587000c789f2c5564c1.png

../_images/4463eb787b5ae368ec7b3175091cc7db97e8e3cb3822e18a1f812d790ef98f96.png

../_images/3966d6ece3a5780439aa44d4255ee6138a6fdf01dc457c5b78abe931c0b3920e.png

../_images/3c809da58308fba4d7b372aa05e2a2027b0bda679c1e01f2953c8bf1595a9f32.png

../_images/308b7818d0cf7ab3a022f3c173ae5078dfb051b193ff5b7fe531b599649a85a1.png

../_images/d8b9ab71fa99da3bf471c42fe09bc951c2e49834fe74c127f0878d3e669850dd.png

Generate Standardized table#

list_of_return_ticker = optimal_pairs_df1["Commodity_1"].to_list()

def wide_to_long_returns(df, list_of_return_ticker):
    """
    Converts a wide-format return DataFrame to long-format with columns: ds, unique_id, y
    """

    if "yyyymm" in df.columns:
        id_col = "yyyymm"
    elif "Date" in df.columns:
        id_col = "Date"
    else:
        raise ValueError("No date column found!")

    cols = [id_col] + list_of_return_ticker
    df_sub = df[cols].copy()

    df_long = df_sub.melt(
        id_vars=id_col,
        value_vars=list_of_return_ticker,
        var_name="unique_id",
        value_name="y",
    )
    df_long = df_long.rename(columns={id_col: "ds"})

    df_long = df_long.dropna(subset=["y"])

    df_long = df_long.sort_values(["unique_id", "ds"]).reset_index(drop=True)

    return df_long[["unique_id", "ds", "y"]]

gsci_replication_df = wide_to_long_returns(df_return1, list_of_return_ticker)

gsci_replication_df.head(5)

	unique_id	ds	y
0	SPGCBRP Index_PX_LAST_Return	1999-02-26	-0.060203
1	SPGCBRP Index_PX_LAST_Return	1999-03-31	0.359040
2	SPGCBRP Index_PX_LAST_Return	1999-04-30	0.075051
3	SPGCBRP Index_PX_LAST_Return	1999-05-28	-0.057141
4	SPGCBRP Index_PX_LAST_Return	1999-06-30	0.132458

One source of deviation in our replication arises from differences in the underlying universe of commodities. The original HKM dataset selects 23 commodities from a broader pool of 31, whereas our replication process matches 23 out of 24 available contracts. Moreover, the two original pools themselves are not entirely overlapping, making mismatches almost inevitable. Commodities with correlations above 0.90 are likely those that overlap directly between the two datasets, while those with lower correlations often do not appear in the HKM table. Nevertheless, the strong alignment for major contracts supports the validity of using the GSCI index as an effective proxy for replicating the HKM methodology.

This interpretation is further supported by Yang’s table: although our replication remains an approximation—since HKM does not explicitly use GSCI data—we observe that the highly correlated tickers typically have at least 275 monthly observations in Yang’s datasets. This pattern suggests that these commodities were very likely included in the original HKM selection due to their long return histories.

Alternative Approach 1#

To closely follow the methodology of He, Kelly, and Manela (2017), we construct commodity return series based on constant one-month maturity futures prices. In their original approach, HKM derive such prices by linearly interpolating between the nearest, second-nearest, and third-nearest futures contracts. To approximate this, we use London Metal Exchange (LME) data where both cash and three-month futures prices are available, and compute a synthetic one-month price using a linear interpolation: \(P_{1m} = P_{\text{cash}} + \frac{1}{3}(P_{3m} - P_{\text{cash}})\). This allows us to construct a monthly price series that mirrors the one-month maturity logic applied in HKM. For other commodities, we adopt the second-nearest (month-2) Bloomberg generic contracts (e.g., CO2 Comdty, NG2 Comdty) as a practical proxy for the interpolated price. Although this is a simplification, it maintains consistency with the spirit of HKM’s approach and ensures comparability across commodities where interpolation is not feasible.

commodity_futures_df = load_futures_data.load_commodities_future(
    data_dir=DATA_DIR / "commodities"
)
lme_df = load_futures_data.load_lme_metals(data_dir=DATA_DIR / "commodities")

metal_map = {
    "Aluminum": ("LMAHDY Comdty_PX_LAST", "LMAHDS03 Comdty_PX_LAST"),
    "Nickel": ("LMNIDY Comdty_PX_LAST", "LMNIDS03 Comdty_PX_LAST"),
    "Lead": ("LMPBDY Comdty_PX_LAST", "LMPBDS03 Comdty_PX_LAST"),
    "Zinc": ("LMZSDY Comdty_PX_LAST", "LMZSDS03 Comdty_PX_LAST"),
    "Copper": ("LMCADY Comdty_PX_LAST", "LMCADS03 Comdty_PX_LAST"),
}

monthly_1mprice = calc_commodities_returns.calc_lme_monthly_1mprice(
    lme_df, metal_map, date_col="index"
)

lme_monthly_return = calc_commodities_returns.calc_lme_monthly_return(monthly_1mprice)

print(lme_monthly_return.head())

   yyyymm       Date  Aluminum    Nickel      Lead  Zinc    Copper
198709 1987-09-30  0.101317 -0.019891 -0.137613   NaN  0.073357
198710 1987-10-31 -0.058047  0.026383  0.016215   NaN  0.080608
198711 1987-11-30 -0.071148  0.073554  0.127458   NaN  0.378630
198712 1987-12-31  0.204262  0.350032  0.002539   NaN  0.113221
198801 1988-01-31 -0.036728 -0.042974  0.005976   NaN -0.249536

cmt_monthly_return = calc_commodities_returns.compute_second_contract_return(
    commodity_futures_df
)
cmt_monthly_return.head()

	yyyymm	Date	CO2 Comdty_PX_LAST	QS2 Comdty_PX_LAST	CL2 Comdty_PX_LAST	XB2 Comdty_PX_LAST	HO2 Comdty_PX_LAST	NG2 Comdty_PX_LAST	HU2 Comdty_PX_LAST	CT2 Comdty_PX_LAST	...	C 2 Comdty_PX_LAST	W 2 Comdty_PX_LAST	LH2 Comdty_PX_LAST	FC2 Comdty_PX_LAST	LC2 Comdty_PX_LAST	HG2 Comdty_PX_LAST	GC2 Comdty_PX_LAST	SI2 Comdty_PX_LAST	PA2 Comdty_PX_LAST	PL2 Comdty_PX_LAST
0	195907	1959-07-31	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1	195908	1959-08-31	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.011952	...	-0.015385	0.005096	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2	195909	1959-09-30	NaN	NaN	NaN	NaN	NaN	NaN	NaN	-0.003609	...	0.014509	0.018378	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
3	195910	1959-10-31	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.005927	...	0.005501	0.016179	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
4	195911	1959-11-30	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.011457	...	0.000000	-0.004287	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

5 rows × 30 columns

combined_df = pd.merge(
    cmt_monthly_return,
    lme_monthly_return.drop(columns=["Date"]),
    how="inner",
    on=["yyyymm"],
    suffixes=("", "_LME"),
).set_index("yyyymm")

common_idx = hkm_df.index.intersection(combined_df.index)

he_kelly_sub = hkm_df.loc[common_idx]
carry_sub = combined_df.loc[common_idx]

corr_matrix2 = generate_corr_matrix(he_kelly_sub, carry_sub.drop(columns=["Date"]))

plt.figure(figsize=(10, 8))
corr_matrix_float2 = corr_matrix2.astype(float)
sns.heatmap(
    corr_matrix_float2,
    annot=False,
    fmt=".2f",
    cmap="coolwarm",
    vmin=-1,
    vmax=1,
    square=True,
    linewidths=0.5,
    cbar=True,
)

plt.title("Correlation Heatmap")
plt.tight_layout()
plt.show()

../_images/028a6a15b58cb90c958d8f505c46dd862d54139d32227b706f061677418d7bf6.png

optimal_pairs_df2, row_ind, col_ind = decide_optimal_pairs(corr_matrix_float2)

ticker_to_commodity = {
    "CO2 Comdty_PX_LAST": "Crude Oil",
    "QS2 Comdty_PX_LAST": "Gasoil",
    "CL2 Comdty_PX_LAST": "WTI Crude",
    "XB2 Comdty_PX_LAST": "Unl. Gasoline",
    "HO2 Comdty_PX_LAST": "Heating Oil",
    "NG2 Comdty_PX_LAST": "Natural Gas",
    "CT2 Comdty_PX_LAST": "Cotton",
    "KC2 Comdty_PX_LAST": "Coffee",
    "CC2 Comdty_PX_LAST": "Cocoa",
    "SB2 Comdty_PX_LAST": "Sugar",
    "S 2 Comdty_PX_LAST": "Soybeans",
    "KW2 Comdty_PX_LAST": "Kansas Wheat",
    "C 2 Comdty_PX_LAST": "Corn",
    "W 2 Comdty_PX_LAST": "Wheat",
    "LH2 Comdty_PX_LAST": "Lean Hogs",
    "FC2 Comdty_PX_LAST": "Feeder Cattle",
    "LC2 Comdty_PX_LAST": "Live Cattle",
    "GC2 Comdty_PX_LAST": "Gold",
    "SI2 Comdty_PX_LAST": "Silver",
    "Aluminum": "Aluminum",
    "Nickel": "Nickel",
    "Lead": "Lead",
    "Zinc": "Zinc",
    "Copper": "Copper",
}


optimal_pairs_df2["Commodity_Name"] = optimal_pairs_df2["Commodity_2"].map(
    ticker_to_commodity
)

optimal_pairs_df2[["Commodity_Name", "Commodity_1", "Correlation"]]

	Commodity_Name	Commodity_1	Correlation
0	Gold	Commod_06	0.996136
1	Silver	Commod_20	0.990287
2	NaN	Commod_16	0.989176
3	Cocoa	Commod_02	0.983713
4	NaN	Commod_15	0.978422
5	Soybeans	Commod_19	0.976728
6	Coffee	Commod_10	0.971527
7	NaN	Commod_21	0.970256
8	NaN	Commod_09	0.969427
9	WTI Crude	Commod_03	0.961483
10	Heating Oil	Commod_08	0.960983
11	Corn	Commod_01	0.953613
12	Copper	Commod_07	0.952118
13	Wheat	Commod_22	0.949708
14	NaN	Commod_14	0.936757
15	Feeder Cattle	Commod_05	0.930403
16	Cotton	Commod_04	0.925083
17	NaN	Commod_11	0.914295
18	NaN	Commod_18	0.868244
19	Crude Oil	Commod_17	0.854062
20	Live Cattle	Commod_12	0.825750
21	Lean Hogs	Commod_13	0.703030
22	Kansas Wheat	Commod_23	0.429840

list_of_return_ticker = optimal_pairs_df2["Commodity_2"].to_list()

gsci_replication_df = wide_to_long_returns(combined_df, list_of_return_ticker)

gsci_replication_df.head(5)

	unique_id	ds	y
0	C 2 Comdty_PX_LAST	1987-09-30	0.130631
1	C 2 Comdty_PX_LAST	1987-10-31	-0.018592
2	C 2 Comdty_PX_LAST	1987-11-30	0.063599
3	C 2 Comdty_PX_LAST	1987-12-31	-0.026718
4	C 2 Comdty_PX_LAST	1988-01-31	0.058824

Alternative Approach 2 – Spot Commodities Futures#

In this approach we use similar method as the previous approach but instead construct a “cash” return panel by combining two sources of prices. For precious metals, we use Bloomberg USD spot series obtained from Curncy tickers (XAUUSD, XAGUSD, XPTUSD, XPDUSD). For other commodities that lack a robust spot series in this module, we use a transparent proxy built from the first generic futures contract (…1 Comdty).

commodity_spot_df = pull_bbg_commodities_basis.load_commodity_spot_proxies(
    data_dir=DATA_DIR / "commodities"
)
precious_metal_spot_df = pull_bbg_commodities_basis.load_precious_metals_spot(
    data_dir=DATA_DIR / "commodities"
)
spot_df = commodity_spot_df.merge(precious_metal_spot_df, on="index")

spot_df.columns

Index(['index', 'CO1 Comdty_PX_LAST', 'QS1 Comdty_PX_LAST',
       'CL1 Comdty_PX_LAST', 'RB1 Comdty_PX_LAST', 'XB1 Comdty_PX_LAST',
       'HO1 Comdty_PX_LAST', 'NG1 Comdty_PX_LAST', 'HU1 Comdty_PX_LAST',
       'CT1 Comdty_PX_LAST', 'KC1 Comdty_PX_LAST', 'CC1 Comdty_PX_LAST',
       'SB1 Comdty_PX_LAST', 'LB1 Comdty_PX_LAST', 'O 1 Comdty_PX_LAST',
       'JO1 Comdty_PX_LAST', 'RR1 Comdty_PX_LAST', 'SM1 Comdty_PX_LAST',
       'WC1 Comdty_PX_LAST', 'S 1 Comdty_PX_LAST', 'KW1 Comdty_PX_LAST',
       'C 1 Comdty_PX_LAST', 'W 1 Comdty_PX_LAST', 'LH1 Comdty_PX_LAST',
       'FC1 Comdty_PX_LAST', 'LC1 Comdty_PX_LAST', 'AL1 Comdty_PX_LAST',
       'HG1 Comdty_PX_LAST', 'GC1 Comdty_PX_LAST', 'SI1 Comdty_PX_LAST',
       'PA1 Comdty_PX_LAST', 'PL1 Comdty_PX_LAST', 'XAUUSD Curncy_PX_LAST',
       'XAGUSD Curncy_PX_LAST', 'XPTUSD Curncy_PX_LAST',
       'XPDUSD Curncy_PX_LAST'],
      dtype='object')

monthly_return = calc_commodities_returns.monthly_returns_mixed_prices(spot_df)

/Users/jbejarano/GitRepositories/ftsfr/src/commodities/calc_commodities_returns.py:322: FutureWarning: The default fill_method='pad' in DataFrame.pct_change is deprecated and will be removed in a future version. Either fill in any non-leading NA values prior to calling pct_change or specify 'fill_method=None' to not fill NA values.
  rets = monthly_px.pct_change()

common_idx = hkm_df.index.intersection(monthly_return.index)

he_kelly_sub = hkm_df.loc[common_idx]
carry_sub = monthly_return.loc[common_idx]

corr_matrix3 = generate_corr_matrix(he_kelly_sub, carry_sub.drop(columns=["Date"]))

plt.figure(figsize=(10, 8))
corr_matrix_float3 = corr_matrix3.astype(float)
sns.heatmap(
    corr_matrix_float3,
    annot=False,
    fmt=".2f",
    cmap="coolwarm",
    vmin=-1,
    vmax=1,
    square=True,
    linewidths=0.5,
    cbar=True,
)

plt.title("Correlation Heatmap")
plt.tight_layout()
plt.show()

../_images/125979dcdcf373ee4f51e5d87dfe054031c5c40d66e24ba9ac309b84d26e089d.png

optimal_pairs_df3, row_ind, col_ind = decide_optimal_pairs(corr_matrix_float3)

ticker_to_name = {
    "GC1": "Gold",
    "SI1": "Silver",
    "PL1": "Platinum",
    "CC1": "Cocoa",
    "KC1": "Coffee",
    "S 1": "Soybeans",
    "PA1": "Palladium",
    "JO1": "Orange Juice",
    "C 1": "Corn",
    "CL1": "WTI Crude Oil",
    "SM1": "Soybean Meal",
    "W 1": "Wheat (SRW)",
    "O 1": "Oats",
    "CT1": "Cotton",
    "LB1": "Lumber",
    "HO1": "Heating Oil",
    "FC1": "Feeder Cattle",
    "HG1": "Copper",
    "CO1": "Brent Crude Oil",
    "RR1": "Rough Rice",
    "LC1": "Live Cattle",
    "LH1": "Lean Hogs",
    "KW1": "Kansas Wheat (HRW)",
}
optimal_pairs_df3["Commodity_Name"] = optimal_pairs_df3["Commodity_2"].map(
    ticker_to_name
)

optimal_pairs_df3

	Commodity_1	Commodity_2	Correlation	Commodity_Name
0	Commod_06	GC1	0.995406	Gold
1	Commod_20	SI1	0.994950	Silver
2	Commod_16	PL1	0.989246	Platinum
3	Commod_02	CC1	0.981584	Cocoa
4	Commod_10	KC1	0.977102	Coffee
5	Commod_19	S 1	0.964628	Soybeans
6	Commod_15	PA1	0.961944	Palladium
7	Commod_09	JO1	0.953366	Orange Juice
8	Commod_01	C 1	0.953333	Corn
9	Commod_03	CL1	0.952918	WTI Crude Oil
10	Commod_21	SM1	0.946333	Soybean Meal
11	Commod_22	W 1	0.946209	Wheat (SRW)
12	Commod_14	O 1	0.922051	Oats
13	Commod_04	CT1	0.914998	Cotton
14	Commod_11	LB1	0.914780	Lumber
15	Commod_08	HO1	0.899496	Heating Oil
16	Commod_05	FC1	0.899395	Feeder Cattle
17	Commod_07	HG1	0.873174	Copper
18	Commod_17	CO1	0.848048	Brent Crude Oil
19	Commod_18	RR1	0.843108	Rough Rice
20	Commod_12	LC1	0.762189	Live Cattle
21	Commod_13	LH1	0.741484	Lean Hogs
22	Commod_23	KW1	0.400043	Kansas Wheat (HRW)

Cleaning Summary: Commodities Returns

Contents