Issue #3 clean up data in historical_hkd file

new file: ../docs/dedup.md new file: dedup.py new file: hkd_historical_dedup
2025-12-17 13:14:25 +01:00 · 2023-05-14 22:02:18 -04:00
parent c8b4545f68
commit b36136b087
3 changed files with 107 additions and 0 deletions
--- a/docs/dedup.md
+++ b/docs/dedup.md
@@ -0,0 +1,71 @@
+```
+# Import the pandas and json libraries
+import pandas as pd
+import json
+
+# Define a function that reads in data from a JSON file, converts it to a DataFrame, removes duplicates, and saves the deduplicated data to a new file
+def get_data_from_file(datafile):
+    # Open the specified JSON file and load its contents into a Python object
+    with open(datafile, 'r') as f:
+        data = json.load(f)
+    # Convert the Python object to a pandas DataFrame
+    df = pd.DataFrame(data)
+
+    # Check for duplicate rows in the DataFrame
+    duplicates = df.duplicated(keep="first")
+    
+    # Remove the duplicate rows from the DataFrame
+    dedup_df = df[~duplicates]
+    
+    # Print the original and deduplicated DataFrame sizes
+    print("Original DataFrame contains {} rows".format(len(df)))
+    print("Deduplicated DataFrame contains {} rows".format(len(dedup_df)))
+    
+    # Identify the dates that were duplicated and removed
+    duplicated_dates = df[duplicates]['date'].unique()
+    print("Dates that were duplicated and removed:")
+    print(duplicated_dates)
+    
+    # Save the deduplicated data to a new JSON file
+    dedup_file = "hkd_historical_dedup"
+    dedup_df.to_json(dedup_file, orient="records")
+
+# If this script is run directly, call the get_data_from_file function with the specified file name
+if __name__ == "__main__":
+    datafile = "hkd_historical"
+    get_data_from_file(datafile)
+```
+This code imports the pandas and json libraries, defines a function get_data_from_file that reads in data from a JSON file, removes any duplicate rows, and saves the deduplicated data to a new file. The function is then called on the file "hkd_historical" when the script is run.
+
+The import pandas as pd statement imports the pandas library and assigns it the alias pd.
+
+The import json statement imports the json library.
+
+The def get_data_from_file(datafile): line defines a function that takes a single argument, datafile.
+
+The with open(datafile, 'r') as f: statement opens the specified JSON file in read mode and assigns the resulting file object to the variable f. The with statement ensures that the file is properly closed when the block is exited.
+
+The data = json.load(f) statement loads the contents of the JSON file into a Python object and assigns it to the variable data.
+
+The df = pd.DataFrame(data) statement converts the Python object to a pandas DataFrame and assigns it to the variable df.
+
+The duplicates = df.duplicated(keep="first") statement checks for duplicate rows in the DataFrame and assigns a boolean Series indicating whether each row is a duplicate to the variable duplicates.
+
+The dedup_df = df[~duplicates] statement selects only the rows in the DataFrame that are not duplicates and assigns the resulting DataFrame to the variable dedup_df.
+
+The print("Original DataFrame contains {} rows".format(len(df))) and print("Deduplicated DataFrame contains {} rows".format(len(dedup_df))) statements print the sizes of the original and deduplicated DataFrames, respectively.
+
+The duplicated_dates = df[duplicates]['date'].unique() statement selects the dates that were duplicated and removes any duplicates from the resulting array, then assigns the resulting array to the variable duplicated_dates.
+
+The print("Dates that were duplicated and removed:") and `print(d
+
+The print("Dates that were duplicated and removed:") and print(duplicated_dates) statements print the dates that were duplicated and removed from the original DataFrame.
+
+The dedup_file = "hkd_historical_dedup" statement assigns the string "hkd_historical_dedup" to the variable dedup_file.
+
+The dedup_df.to_json(dedup_file, orient="records") statement saves the deduplicated data to a new JSON file with the name specified in dedup_file, using the "records" orientation.
+
+The if __name__ == "__main__": statement checks whether the script is being run directly (as opposed to being imported as a module).
+
+The datafile = "hkd_historical" statement assigns the string "hkd_historical" to the variable datafile.
+The get_data_from_file(datafile) statement calls the get_data_from_file function with datafile as its argument.
--- a/public/dedup.py
+++ b/public/dedup.py
@@ -0,0 +1,35 @@
+# Import the pandas and json libraries
+import pandas as pd
+import json
+
+# Define a function that reads in data from a JSON file, converts it to a DataFrame, removes duplicates, and saves the deduplicated data to a new file
+def get_data_from_file(datafile):
+    # Open the specified JSON file and load its contents into a Python object
+    with open(datafile, 'r') as f:
+        data = json.load(f)
+    # Convert the Python object to a pandas DataFrame
+    df = pd.DataFrame(data)
+
+    # Check for duplicate rows in the DataFrame
+    duplicates = df.duplicated(keep="first")
+    
+    # Remove the duplicate rows from the DataFrame
+    dedup_df = df[~duplicates]
+    
+    # Print the original and deduplicated DataFrame sizes
+    print("Original DataFrame contains {} rows".format(len(df)))
+    print("Deduplicated DataFrame contains {} rows".format(len(dedup_df)))
+    
+    # Identify the dates that were duplicated and removed
+    duplicated_dates = df[duplicates]['date'].unique()
+    print("Dates that were duplicated and removed:")
+    print(duplicated_dates)
+    
+    # Save the deduplicated data to a new JSON file
+    dedup_file = "hkd_historical_dedup"
+    dedup_df.to_json(dedup_file, orient="records")
+
+# If this script is run directly, call the get_data_from_file function with the specified file name
+if __name__ == "__main__":
+    datafile = "hkd_historical"
+    get_data_from_file(datafile)
--- a/public/hkd_historical_dedup
+++ b/public/hkd_historical_dedup