Skip to content

Commit

Permalink
More standard standardization. Still [-1, 1]
Browse files Browse the repository at this point in the history
  • Loading branch information
Emily committed Nov 2, 2023
1 parent d8d66b5 commit 46f26b8
Show file tree
Hide file tree
Showing 62 changed files with 1,222,934 additions and 1,222,947 deletions.
1 change: 0 additions & 1 deletion cities/queries/fips_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -526,7 +526,6 @@ def plot_kins_other_var(self, var):
: self.top
]


for i, geoname in enumerate(geonames_top):
subset = others_plot_data[others_plot_data["GeoName"] == geoname]
# print("subset.head")
Expand Down
48 changes: 18 additions & 30 deletions cities/utils/cleaning_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,36 +13,24 @@ def standardize_and_scale(data: pd.DataFrame) -> pd.DataFrame:
"""
Standardizes and scales float columns in a DataFrame to [-1,1], copying other columns. Returns a new DataFrame.
"""
standard_scaler = StandardScaler()

new_data = pd.DataFrame()
for column in data.columns:
if data.dtypes[column] != "float64":
new_data[column] = data[column].copy()
else:
new = data[column].copy().values.reshape(-1, 1)
new = standard_scaler.fit_transform(new)

positive_mask = new >= 0
negative_mask = new < 0

min_positive = np.min(new[positive_mask])
max_positive = np.max(new[positive_mask])
scaled_positive = (new[positive_mask] - min_positive) / (
max_positive - min_positive
)

min_negative = np.min(new[negative_mask])
max_negative = np.max(new[negative_mask])
scaled_negative = (new[negative_mask] - min_negative) / (
max_negative - min_negative
) - 1

scaled_values = np.empty_like(new, dtype=float)
scaled_values[positive_mask] = scaled_positive
scaled_values[negative_mask] = scaled_negative

new_data[column] = scaled_values.reshape(-1)
standard_scaler = StandardScaler() # Standardize to mean 0, std 1

def sigmoid(x, scale=1 / 3):
    """
    Squashes x into the range -1 to 1 using a rescaled logistic curve.

    x is multiplied by scale before the logistic function is applied, so a
    smaller scale gives a flatter curve. An input of 0 maps to 0. With the
    default scale of 1/3, x = 3 maps to roughly 0.46.
    """
    # Logistic value in the range 0 to 1.
    logistic = 1 / (1 + np.exp(-x * scale))
    # Stretch by 2 and shift down by 1 to centre the output on 0.
    return 2 * logistic - 1

# Copy all columns first
new_data = data.copy()

# Select float columns
float_cols = data.select_dtypes(include=["float64"])

# Standardize float columns to mean 0, std 1
standardized_floats = standard_scaler.fit_transform(float_cols)

# Apply sigmoid transformation into (-1, 1); with scale=1/3, +/-3 std maps to about +/-0.46
new_data[float_cols.columns] = sigmoid(standardized_floats, scale=1 / 3)

return new_data

Expand Down
61,480 changes: 30,740 additions & 30,740 deletions data/processed/ethnic_composition_std_long.csv

Large diffs are not rendered by default.

6,148 changes: 3,074 additions & 3,074 deletions data/processed/ethnic_composition_std_wide.csv

Large diffs are not rendered by default.

Loading

0 comments on commit 46f26b8

Please sign in to comment.