Skip to content

Instantly share code, notes, and snippets.

@patrick-weiss
Last active January 2, 2024 19:17
Show Gist options
  • Save patrick-weiss/86ddef6de978fbdfb22609a7840b5d8b to your computer and use it in GitHub Desktop.
Save patrick-weiss/86ddef6de978fbdfb22609a7840b5d8b to your computer and use it in GitHub Desktop.
Python function to clean enhanced TRACE from "Tidy Finance with Python"
# Cleaning code for enhanced TRACE
# Check out https://www.tidy-finance.org/python/trace-and-fisd.html for reference
# Code based on procedure described in https://openbondassetpricing.com/
def clean_enhanced_trace(
cusips,
connection,
start_date="'01/01/2002'",
end_date="'12/31/2023'"
):
import pandas as pd
import numpy as np
# Load main file ------------------------------------------------#
trace_query = (
"""SELECT cusip_id, bond_sym_id, trd_exctn_dt,
trd_exctn_tm, days_to_sttl_ct, lckd_in_ind,
wis_fl, sale_cndtn_cd, msg_seq_nb,
trc_st, trd_rpt_dt, trd_rpt_tm,
entrd_vol_qt, rptd_pr, yld_pt,
asof_cd, orig_msg_seq_nb, rpt_side_cd,
cntra_mp_id, stlmnt_dt, spcl_trd_fl """ +
"FROM trace.trace_enhanced " +
"WHERE cusip_id IN " + cusips + " " +
"AND trd_exctn_dt BETWEEN " + start_date + " AND " + end_date
)
trace_all = (pd.read_sql_query(
sql=trace_query,
con=connection,
parse_dates={"trd_exctn_dt","trd_rpt_dt", "stlmnt_dt"}
)
)
# Post 2012-06-02 -------------------------------------------------#
# Trades (trc_st = T) and correction (trc_st = R)
trace_post_TR = (trace_all
.query("trc_st in ['T', 'R']")
.query("trd_rpt_dt >= '2012-06-02'")
)
# Cancellations (trc_st = X) and correction cancellations (trc_st = C) -----#
trace_post_XC = (trace_all
.query("trc_st in ['X', 'C']")
.query("trd_rpt_dt >= '2012-06-02'")
.get(["cusip_id", "msg_seq_nb", "entrd_vol_qt",
"rptd_pr", "rpt_side_cd", "cntra_mp_id",
"trd_exctn_dt", "trd_exctn_tm"])
.assign(drop=True)
)
# Cleaning corrected and cancelled trades
trace_post_TR = (trace_post_TR
.merge(trace_post_XC, how="left")
.query("drop != True")
.drop(columns="drop")
)
# Reversals (trc_st = Y) -----#
trace_post_Y = (trace_all
.query("trc_st == 'Y'")
.query("trd_rpt_dt >= '2012-06-02'")
.get(["cusip_id", "orig_msg_seq_nb", "entrd_vol_qt",
"rptd_pr", "rpt_side_cd", "cntra_mp_id",
"trd_exctn_dt", "trd_exctn_tm"])
.assign(drop=True)
.rename(columns={"orig_msg_seq_nb": "msg_seq_nb"})
)
# Clean reversals
## match the orig_msg_seq_nb of the Y-message to
## the msg_seq_nb of the main message
trace_post = (trace_post_TR
.merge(trace_post_Y, how="left")
.query("drop != True")
.drop(columns="drop")
)
# Pre 06-02-12 --------------------------------------------------#
# Trades (trc_st = T)
trace_pre_T = (trace_all
# .query("trc_st == 'T'")
.query("trd_rpt_dt < '2012-06-02'")
)
# Cancelations (trc_st = C) -----#
trace_pre_C = (trace_all
.query("trc_st == 'C'")
.query("trd_rpt_dt < '2012-06-02'")
.get(["cusip_id", "orig_msg_seq_nb", "entrd_vol_qt",
"rptd_pr", "rpt_side_cd", "cntra_mp_id",
"trd_exctn_dt", "trd_exctn_tm"])
.assign(drop=True)
.rename(columns={"orig_msg_seq_nb": "msg_seq_nb"})
)
# Remove cancellations from trades
## match the orig_msg_seq_nb of the C-message
## to the msg_seq_nb of the main message
trace_pre_T = (trace_pre_T
.merge(trace_pre_C, how="left")
.query("drop != True")
.drop(columns="drop")
)
# Corrections (trc_st = W) - can also correct a previous W -----#
trace_pre_W = (trace_all
.query("trc_st == 'W'")
.query("trd_rpt_dt < '2012-06-02'")
)
# Implement corrections in a loop
## Correction control
correction_control = len(trace_pre_W)
correction_control_last = len(trace_pre_W)
## Correction loop
while (correction_control > 0):
# Create placeholder
## Only identifying columns of trace_pre_T (for joins)
placeholder_trace_pre_T = (trace_pre_T
.get(["cusip_id", "trd_exctn_dt", "msg_seq_nb"])
.rename(columns={"msg_seq_nb": "orig_msg_seq_nb"})
.assign(matched_T=True)
)
# Corrections that correct some msg
trace_pre_W_correcting = (trace_pre_W
.merge(placeholder_trace_pre_T, how="left")
.query("matched_T == True")
.drop(columns="matched_T")
)
# Corrections that do not correct some msg
trace_pre_W = (trace_pre_W
.merge(placeholder_trace_pre_T, how="left")
.query("matched_T != True")
.drop(columns="matched_T")
)
# Create placeholder
## Only identifying columns of trace_pre_W_correcting (for anti-joins)
placeholder_trace_pre_W_correcting = (trace_pre_W_correcting
.get(["cusip_id", "trd_exctn_dt", "orig_msg_seq_nb"])
.rename(columns={"orig_msg_seq_nb": "msg_seq_nb"})
.assign(corrected=True)
)
# Delete msgs that are corrected
trace_pre_T = (trace_pre_T
.merge(placeholder_trace_pre_W_correcting, how="left")
.query("corrected != True")
.drop(columns="corrected")
)
# Add correction msgs
trace_pre_T = pd.concat([trace_pre_T, trace_pre_W_correcting])
# Escape if no corrections remain or they cannot be matched
correction_control = len(trace_pre_W)
if correction_control == correction_control_last:
break
else:
correction_control_last = len(trace_pre_W)
continue
# Reversals (asof_cd = R) -----#
## Record reversals
trace_pre_R = (trace_pre_T
.query("asof_cd == 'R'")
.sort_values(["cusip_id", "trd_exctn_dt",
"trd_exctn_tm", "trd_rpt_dt", "trd_rpt_tm"])
)
## Prepare final data
trace_pre = (trace_pre_T
.query("asof_cd == None | asof_cd.isnull() | asof_cd not in ['R', 'X', 'D']")
.sort_values(["cusip_id", "trd_exctn_dt",
"trd_exctn_tm", "trd_rpt_dt", "trd_rpt_tm"])
)
## Add grouped row numbers
trace_pre_R["seq"] = (trace_pre_R
.groupby(["cusip_id", "trd_exctn_dt", "entrd_vol_qt",
"rptd_pr", "rpt_side_cd", "cntra_mp_id"])
.cumcount()
)
trace_pre["seq"] = (trace_pre
.groupby(["cusip_id", "trd_exctn_dt", "entrd_vol_qt",
"rptd_pr", "rpt_side_cd", "cntra_mp_id"])
.cumcount()
)
## Select columns for reversal cleaning
trace_pre_R = (trace_pre_R
.get(["cusip_id", "trd_exctn_dt", "entrd_vol_qt",
"rptd_pr", "rpt_side_cd", "cntra_mp_id", "seq"])
.assign(reversal=True)
)
## Remove reversals and the reversed trade
trace_pre = (trace_pre
.merge(trace_pre_R, how="left")
.query("reversal != True")
.drop(columns=["reversal", "seq"])
)
# Agency trades -------------------------------------------------#
# Combine pre and post trades
trace_clean = pd.concat([trace_pre, trace_post])
# Keep agency sells and unmatched agency buys
## Agency sells
trace_agency_sells = (trace_clean
.query("cntra_mp_id == 'D' & rpt_side_cd == 'S'")
)
# Placeholder for trace_agency_sells with relevant columns
placeholder_trace_agency_sells = (trace_agency_sells
.get(["cusip_id", "trd_exctn_dt",
"entrd_vol_qt", "rptd_pr"])
.assign(matched=True)
)
# Agency buys that are unmatched
trace_agency_buys_filtered = (trace_clean
.query("cntra_mp_id == 'D' & rpt_side_cd == 'B'")
.merge(placeholder_trace_agency_sells, how="left")
.query("matched != True")
.drop(columns="matched")
)
# Non-agency
trace_nonagency = (trace_clean
.query("cntra_mp_id == 'C'")
)
# Agency cleaned
trace_clean = pd.concat([trace_nonagency,trace_agency_sells,trace_agency_buys_filtered])
# Additional Filters --------------------------------------------#
trace_add_filters = (trace_clean
.assign(days_to_sttl_ct2 = lambda x: (x["stlmnt_dt"] - x["trd_exctn_dt"]).dt.days)
.assign(days_to_sttl_ct = lambda x: pd.to_numeric(x["days_to_sttl_ct"], errors='coerce'))
.query("days_to_sttl_ct.isnull() | days_to_sttl_ct <= 7")
.query("days_to_sttl_ct2.isnull() | days_to_sttl_ct2 <= 7")
.query("wis_fl == 'N'")
.query("spcl_trd_fl.isnull() | spcl_trd_fl == ''")
.query("asof_cd.isnull() | asof_cd == ''")
)
# Output --------------------------------------------------------#
# Only keep necessary columns
trace_final = (trace_add_filters
.sort_values(["cusip_id", "trd_exctn_dt", "trd_exctn_tm"])
.get(["cusip_id", "trd_exctn_dt", "trd_exctn_tm",
"rptd_pr", "entrd_vol_qt", "yld_pt", "rpt_side_cd", "cntra_mp_id"])
)
# Return
return trace_final
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment