mirror of
https://github.com/aljazceru/satshkd-vercel.git
synced 2025-12-17 05:04:24 +01:00
Issue #3: clean up data in the historical_hkd file
New files: ../docs/dedup.md, dedup.py, hkd_historical_dedup
This commit is contained in:
35
public/dedup.py
Normal file
35
public/dedup.py
Normal file
@@ -0,0 +1,35 @@
|
||||
# Import the pandas and json libraries
|
||||
import pandas as pd
|
||||
import json
|
||||
|
||||
# Read data from a JSON file, remove duplicate rows, report what was removed,
# and save the deduplicated data to a new file.
def get_data_from_file(datafile, dedup_file="hkd_historical_dedup"):
    """Deduplicate the records stored in a JSON file.

    Parameters
    ----------
    datafile : str
        Path to the input JSON file containing a list of records.
    dedup_file : str, optional
        Path the deduplicated data is written to (JSON, ``records``
        orientation). Defaults to ``"hkd_historical_dedup"`` to preserve the
        original behavior.

    Returns
    -------
    pandas.DataFrame
        The deduplicated DataFrame (also written to ``dedup_file``).
    """
    # Load the JSON payload into a pandas DataFrame.
    with open(datafile, 'r') as f:
        data = json.load(f)
    df = pd.DataFrame(data)

    # Mark every repeat of a row after its first occurrence, then keep
    # only the unmarked (first) rows.
    duplicates = df.duplicated(keep="first")
    dedup_df = df[~duplicates]

    # Report the before/after sizes.
    print("Original DataFrame contains {} rows".format(len(df)))
    print("Deduplicated DataFrame contains {} rows".format(len(dedup_df)))

    # Identify the dates that were duplicated and removed. The original code
    # assumed a 'date' column and crashed (KeyError) when it was absent; the
    # report is now skipped for inputs without that column.
    if 'date' in df.columns:
        duplicated_dates = df[duplicates]['date'].unique()
        print("Dates that were duplicated and removed:")
        print(duplicated_dates)

    # Save the deduplicated data to a new JSON file.
    dedup_df.to_json(dedup_file, orient="records")
    return dedup_df
||||
# Entry point: deduplicate the default historical HKD data file when this
# script is executed directly (has no effect on import).
if __name__ == "__main__":
    get_data_from_file("hkd_historical")
Reference in New Issue
Block a user