Issue #3 clean up data in historical_hkd file

new file:   ../docs/dedup.md
new file:   dedup.py
new file:   hkd_historical_dedup
This commit is contained in:
bitc0indad
2023-05-14 22:02:18 -04:00
parent c8b4545f68
commit b36136b087
3 changed files with 107 additions and 0 deletions

35
public/dedup.py Normal file
View File

@@ -0,0 +1,35 @@
# Import the pandas and json libraries
import pandas as pd
import json
# Define a function that reads in data from a JSON file, converts it to a DataFrame, removes duplicates, and saves the deduplicated data to a new file
def get_data_from_file(datafile):
# Open the specified JSON file and load its contents into a Python object
with open(datafile, 'r') as f:
data = json.load(f)
# Convert the Python object to a pandas DataFrame
df = pd.DataFrame(data)
# Check for duplicate rows in the DataFrame
duplicates = df.duplicated(keep="first")
# Remove the duplicate rows from the DataFrame
dedup_df = df[~duplicates]
# Print the original and deduplicated DataFrame sizes
print("Original DataFrame contains {} rows".format(len(df)))
print("Deduplicated DataFrame contains {} rows".format(len(dedup_df)))
# Identify the dates that were duplicated and removed
duplicated_dates = df[duplicates]['date'].unique()
print("Dates that were duplicated and removed:")
print(duplicated_dates)
# Save the deduplicated data to a new JSON file
dedup_file = "hkd_historical_dedup"
dedup_df.to_json(dedup_file, orient="records")
# If this script is run directly, call the get_data_from_file function with the specified file name
if __name__ == "__main__":
datafile = "hkd_historical"
get_data_from_file(datafile)