mirror of
https://github.com/aljazceru/satshkd-vercel.git
synced 2025-12-17 05:04:24 +01:00
Issue #3: clean up data in the historical_hkd file
New files: ../docs/dedup.md, dedup.py, hkd_historical_dedup
This commit is contained in:
35
public/dedup.py
Normal file
35
public/dedup.py
Normal file
@@ -0,0 +1,35 @@
|
||||
# Import the pandas and json libraries
|
||||
import pandas as pd
|
||||
import json
|
||||
|
||||
# Read data from a JSON file, remove duplicate rows, report what was removed,
# and save the deduplicated data to a new file.
def get_data_from_file(datafile, dedup_file="hkd_historical_dedup"):
    """Deduplicate the records stored in a JSON file.

    Parameters
    ----------
    datafile : str
        Path to the input JSON file containing a list of records.
    dedup_file : str, optional
        Path the deduplicated data is written to (JSON, ``records``
        orientation). Defaults to ``"hkd_historical_dedup"`` to preserve the
        original behavior.

    Returns
    -------
    pandas.DataFrame
        The deduplicated DataFrame (also written to ``dedup_file``).
    """
    # Load the JSON payload into a pandas DataFrame.
    with open(datafile, 'r') as f:
        data = json.load(f)
    df = pd.DataFrame(data)

    # Mark every repeat of a row after its first occurrence, then keep
    # only the unmarked (first) rows.
    duplicates = df.duplicated(keep="first")
    dedup_df = df[~duplicates]

    # Report the before/after sizes.
    print("Original DataFrame contains {} rows".format(len(df)))
    print("Deduplicated DataFrame contains {} rows".format(len(dedup_df)))

    # Identify the dates that were duplicated and removed. The original code
    # assumed a 'date' column and crashed (KeyError) when it was absent; the
    # report is now skipped for inputs without that column.
    if 'date' in df.columns:
        duplicated_dates = df[duplicates]['date'].unique()
        print("Dates that were duplicated and removed:")
        print(duplicated_dates)

    # Save the deduplicated data to a new JSON file.
    dedup_df.to_json(dedup_file, orient="records")
    return dedup_df
||||
# Entry point: deduplicate the default historical HKD data file when this
# script is executed directly (has no effect on import).
if __name__ == "__main__":
    get_data_from_file("hkd_historical")
Reference in New Issue
Block a user