datos-faltantes

import janitor
import matplotlib.pyplot as plt
import missingno
import numpy as np
import pandas as pd
import pyreadr
import seaborn as sns
import session_info
import upsetplot

%run pandas-missing-extension.ipynb

%matplotlib inline

sns.set(rc={"figure.figsize": (10, 10)})
sns.set_style("whitegrid")

print(
    None or True,
    None or False,
    None == None,
    None is None,
    # None + True,   # raises TypeError: unsupported operand types
    # None / False,  # raises TypeError: unsupported operand types
    type(None),
    sep="\n"
)

print(
    np.nan or True,
    np.nan or False,
    np.nan == np.nan,
    np.nan is np.nan,
    np.nan / 2,
    np.nan * 7,
    type(np.nan),
    np.isnan(np.nan),
    sep="\n"
)

test_missing_df = pd.DataFrame.from_dict(
    data=dict(
        x=[0, 1, np.nan, np.nan, None],
        y=[0, 1, pd.NA, np.nan, None]
    )
)
test_missing_df

test_missing_df.isna()

test_missing_df.isnull()

test_missing_df.x.isnull()

pd.Series([1, np.nan])

pd.Series([pd.to_datetime("2022-01-01"), np.nan])

pd.Series([-1]).isnull()

pima_indians_diabetes_url = "https://nrvis.com/data/mldata/pima-indians-diabetes.csv"

!wget -O ./data/pima-indians-diabetes.csv { pima_indians_diabetes_url } -q

diabetes_df = pd.read_csv(
    filepath_or_buffer="./data/pima-indians-diabetes.csv",  # or pima_indians_diabetes_url
    sep=",",
    names=[
        "pregnancies",
        "glucose",
        "blood_pressure",
        "skin_thickness",
        "insulin",
        "bmi",
        "diabetes_pedigree_function",
        "age",
        "outcome",
    ]
)

base_url = "https://github.com/njtierney/naniar/raw/master/data/"
datasets_names = ("oceanbuoys", "pedestrian", "riskfactors")
extension = ".rda"

datasets_dfs = {}

for dataset_name in datasets_names:
    dataset_file = f"{ dataset_name }{ extension }"
    dataset_output_file = f"./data/{ dataset_file }"
    dataset_url = f"{ base_url }{ dataset_file }"
    !wget -q -O { dataset_output_file } { dataset_url }
    datasets_dfs[f"{ dataset_name }_df"] = pyreadr.read_r(dataset_output_file).get(dataset_name)

datasets_dfs.keys()

locals().update(**datasets_dfs)
del datasets_dfs

oceanbuoys_df.shape, pedestrian_df.shape, riskfactors_df.shape, diabetes_df.shape

riskfactors_df.info()

riskfactors_df.isna()


riskfactors_df.missing.number_complete()

riskfactors_df.missing.number_missing()

riskfactors_df.missing.missing_variable_summary()

riskfactors_df.missing.missing_variable_table()

riskfactors_df.missing.missing_case_summary()

riskfactors_df.missing.missing_case_table()

(
    riskfactors_df
    .missing
    .missing_variable_span(
        variable="weight_lbs",
        span_every=50
    )
)

(
    riskfactors_df
    .missing
    .missing_variable_run(
        variable="weight_lbs"
    )
)

riskfactors_df.missing.missing_variable_plot()

riskfactors_df.missing.missing_case_plot()

(
    riskfactors_df
    .missing
    .missing_variable_span_plot(
        variable="weight_lbs",
        span_every=10,
        rot=0
    )
)

missingno.bar(df=riskfactors_df)

missingno.matrix(df=riskfactors_df)

(
    riskfactors_df
    .missing
    .missing_upsetplot(
        variables=None,
        element_size=60
    )
)

common_na_strings = (
    "missing",
    "NA",
    "N A",
    "N/A",
    "#N/A",
    "NA ",
    " NA",
    "N /A",
    "N / A",
    " N / A",
    "N / A ",
    "na",
    "n a",
    "n/a",
    "na ",
    " na",
    "n /a",
    "n / a",
    " n / a",
    "n / a ",
    "NULL",
    "null",
    "",
    "?",
    "*",
    ".",
)

common_na_numbers = (-9, -99, -999, -9999, 9999, 66, 77, 88, -1)

missing_data_example_df = pd.DataFrame.from_dict(
    dict(
        x=[1, 3, "NA", -99, -98, -99],
        y=["A", "N/A", "NA", "E", "F", "G"],
        z=[-100, -99, -98, -101, -1, -1]
    )
)
missing_data_example_df

missing_data_example_df.missing.number_missing()

missing_data_example_df.dtypes

missing_data_example_df.x.unique()

(
    missing_data_example_df
    .select_dtypes(object)
    .apply(pd.unique)
)

pd.read_csv(
    "./data/missing_data_enconding_example.csv",
    na_filter=True,
    na_values=[-99, -1]
)

(
    missing_data_example_df
    .replace(
        to_replace=[-99, "NA"],
        value=np.nan
    )
)

(
    missing_data_example_df
    .replace(
        to_replace={
            "x": {-99: np.nan}
        }
    )
)

implicit_to_explicit_df = pd.DataFrame.from_dict(
    data={
        "name": ["lynn", "lynn", "lynn", "zelda"],
        "time": ["morning", "afternoon", "night", "morning"],
        "value": [350, 310, np.nan, 320]
    }
)
implicit_to_explicit_df

(
    implicit_to_explicit_df
    .pivot_wider(
        index="name",
        names_from="time",
        values_from="value"
    )
)

(
    implicit_to_explicit_df
    .value_counts(subset=["name"])
    .reset_index(name="n")
    .query("n < 2")
)

(
    implicit_to_explicit_df
    .complete(
        "name",
        "time",
    )
)

(
    implicit_to_explicit_df
    # pyjanitor
    .complete(
        {"name": ["lynn", "zelda"]},
        {"time": ["morning", "afternoon"]},
        sort=True
    )
)

(
    implicit_to_explicit_df
    # pyjanitor
    .complete(
        "name",
        "time",
        fill_value=np.nan
    )
)

(
    implicit_to_explicit_df
    # pyjanitor
    .complete(
        "name",
        "time",
        fill_value=0,
        explicit=False
    )
)

diabetes_df.missing.missing_variable_plot()

diabetes_df[diabetes_df.columns[1:6]] = diabetes_df[diabetes_df.columns[1:6]].replace(0, np.nan)

diabetes_df.missing.missing_variable_plot()

(
    diabetes_df
    .missing.sort_variables_by_missingness()
    .pipe(missingno.matrix)
)

(
    diabetes_df
    .missing.sort_variables_by_missingness()
    .sort_values(by="blood_pressure")
    .pipe(missingno.matrix)
)

(
    diabetes_df
    .missing.sort_variables_by_missingness()
    .sort_values("insulin")
    .pipe(missingno.matrix)
)

(
    riskfactors_df
    .isna()
    .replace({False: "Not missing", True: "Missing"})
    .add_suffix("_NA")
    .pipe(
        lambda shadow_matrix: pd.concat(
            [riskfactors_df, shadow_matrix],
            axis="columns"
        )
    )
)

(
    riskfactors_df
    .missing
    .bind_shadow_matrix(only_missing=True)
)

(
    riskfactors_df
    .missing
    .bind_shadow_matrix(only_missing=True)
    .groupby(["weight_lbs_NA"])
    ["age"]
    .describe()
    .reset_index()
)

(
    riskfactors_df
    .missing
    .bind_shadow_matrix(only_missing=True)
    .pipe(
        lambda df: sns.displot(
            data=df,
            x="age",
            hue="weight_lbs_NA",
            kind="kde"
        )
    )
)

(
    riskfactors_df
    .missing
    .bind_shadow_matrix(only_missing=True)
    .pipe(
        lambda df: sns.boxenplot(
            data=df,
            x="weight_lbs_NA",
            y="age"
        )
    )
)

(
    riskfactors_df
    .missing
    .bind_shadow_matrix(only_missing=True)
    .pipe(
        lambda df: sns.displot(
            data=df,
            x="age",
            col="weight_lbs_NA",
            facet_kws={"sharey": False}
        )
    )
)

(
    riskfactors_df
    .missing
    .bind_shadow_matrix(only_missing=True)
    .pipe(
        lambda df: sns.displot(
            data=df,
            x="age",
            col="marital_NA",
            row="weight_lbs_NA"
        )
    )
)

def column_fill_with_dummies(
    column: pd.Series,
    proportion_below: float = 0.10,
    jitter: float = 0.075,
    seed: int = 42
) -> pd.Series:
    column = column.copy(deep=True)
    # Extract values metadata.
    missing_mask = column.isna()
    number_missing_values = missing_mask.sum()
    column_range = column.max() - column.min()
    # Shift data below the column's minimum.
    column_shift = column.min() - column.min() * proportion_below
    # Create the "jitter" (noise) to be added around the points.
    np.random.seed(seed)
    column_jitter = (np.random.rand(number_missing_values) - 2) * column_range * jitter
    # Save new dummy data.
    column[missing_mask] = column_shift + column_jitter
    return column

plt.figure(figsize=(10, 10))

(
    riskfactors_df
    .select_dtypes(exclude="category")
    .pipe(lambda df: df[df.columns[df.isna().any()]])
    .missing.bind_shadow_matrix(true_string=True, false_string=False)
    .apply(
        lambda column: column if "_NA" in column.name
        else column_fill_with_dummies(column, proportion_below=0.05, jitter=0.075)
    )
    .assign(
        nullity=lambda df: df.weight_lbs_NA | df.height_inch_NA
    )
    .pipe(
        lambda df: sns.scatterplot(
            data=df,
            x="weight_lbs",
            y="height_inch",
            hue="nullity"
        )
    )
)

missingno.heatmap(df=riskfactors_df)

missingno.dendrogram(df=riskfactors_df)

riskfactors_df.shape

(
    riskfactors_df
    .weight_lbs
    .mean()
)

riskfactors_df.weight_lbs.size, riskfactors_df.weight_lbs.count()

riskfactors_df.weight_lbs.mean(skipna=False)

( riskfactors_df .dropna( subset=["weight_lbs"], how="any" ) .shape)

(
    riskfactors_df
    .dropna(
        subset=["weight_lbs", "height_inch"],
        how="any"
    )
    .shape
)

(
    riskfactors_df
    .dropna(
        subset=["weight_lbs", "height_inch"],
        how="all"
    )
    .shape
)

(
    riskfactors_df
    .dropna(
        subset=["weight_lbs", "height_inch"],
        how="any"
    )
    .select_columns(["weight_lbs", "height_inch"])
    .pipe(lambda df: missingno.matrix(df))
)

(
    riskfactors_df
    .dropna(
        subset=["weight_lbs", "height_inch"],
        how="all"
    )
    .select_columns(["weight_lbs", "height_inch"])
    .pipe(lambda df: missingno.matrix(df))
)

implicit_to_explicit_df = pd.DataFrame(
    data={
        "name": ["lynn", np.nan, "zelda", np.nan, "shadowsong", np.nan],
        "time": ["morning", "afternoon", "morning", "afternoon", "morning", "afternoon"],
        "value": [350, 310, 320, 350, 310, 320]
    }
)
implicit_to_explicit_df

implicit_to_explicit_df.ffill()

plt.figure(figsize=(10, 10))

(
    riskfactors_df
    .select_columns("weight_lbs", "height_inch", "bmi")
    .missing.bind_shadow_matrix(true_string=True, false_string=False)
    .apply(
        axis="rows",
        func=lambda column: column.fillna(column.mean())
        if "_NA" not in column.name
        else column
    )
    .pipe(
        lambda df: sns.displot(
            data=df,
            x="weight_lbs",
            hue="weight_lbs_NA"
        )
    )
)

plt.figure(figsize=(10, 10))

(
    riskfactors_df
    .select_columns("weight_lbs", "height_inch", "bmi")
    .missing.bind_shadow_matrix(true_string=True, false_string=False)
    .apply(
        axis="rows",
        func=lambda column: column.fillna(column.mean())
        if "_NA" not in column.name
        else column
    )
    .assign(
        imputed=lambda df: df.weight_lbs_NA | df.height_inch_NA
    )
    .pipe(
        lambda df: sns.scatterplot(
            data=df,
            x="weight_lbs",
            y="height_inch",
            hue="imputed"
        )
    )
)

plt.figure(figsize=(10, 10))

(
    riskfactors_df
    .select_columns("weight_lbs", "height_inch", "bmi")
    .missing.bind_shadow_matrix(true_string=True, false_string=False)
    .apply(
        axis="rows",
        func=lambda column: column.fillna(column.mean())
        if "_NA" not in column.name
        else column,
    )
    .pivot_longer(index="*_NA")
    .pivot_longer(
        index=["variable", "value"],
        names_to="variable_NA",
        values_to="value_NA"
    )
    .assign(
        valid=lambda df: df.apply(
            axis="columns",
            func=lambda column: column.variable in column.variable_NA
        )
    )
    .query("valid")
    .pipe(
        lambda df: sns.displot(
            data=df,
            x="value",
            hue="value_NA",
            col="variable",
            common_bins=False,
            facet_kws={"sharex": False, "sharey": False}
        )
    )
)

session_info.show()


FAQs

How much missing data is acceptable?

How much data is missing? The overall percentage of data that is missing is important. Generally, if less than 5% of values are missing then it is acceptable to ignore them (REF).
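In pandas this check is a one-liner; here is a minimal sketch against the riskfactors_df loaded earlier (the 5% cutoff is just the rule of thumb quoted above):

# Overall percentage of missing values across the whole DataFrame.
pct_missing = riskfactors_df.isna().sum().sum() / riskfactors_df.size * 100
print(f"{pct_missing:.2f}% of all values are missing")

# Rule of thumb from above: under 5%, ignoring them may be acceptable.
print("Possibly ignorable" if pct_missing < 5 else "Handle explicitly")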

How to handle data missing not at random?

Decide how to handle missing data (a pandas sketch of the first three options follows this list):
  1. Delete the variable with the missing data.
  2. Delete the cases with the missing data.
  3. Impute (fill in) the missing data.
  4. Model the missing data.
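A minimal sketch of options 1-3 with riskfactors_df and the weight_lbs variable used throughout this notebook; option 4, modeling the missingness, needs a dedicated estimator and is beyond a one-liner:

# 1. Delete the variable (column) with the missing data.
riskfactors_df.drop(columns=["weight_lbs"])

# 2. Delete the cases (rows) with the missing data.
riskfactors_df.dropna(subset=["weight_lbs"])

# 3. Impute the missing data (here with the median, as one possible choice).
riskfactors_df.weight_lbs.fillna(riskfactors_df.weight_lbs.median())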

How to handle missing data imputation?

Mean/Median/Mode Imputation: Replace missing entries with the average (mean), middle value (median), or most frequent value (mode) of the corresponding column. This is a quick and easy approach, but it can introduce bias if the missing data is not randomly distributed.
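As a minimal sketch with columns from this notebook (mode() can return several values, hence the [0]):

# Mean imputation on a numeric column.
diabetes_df.glucose.fillna(diabetes_df.glucose.mean())

# Median imputation, more robust to outliers.
diabetes_df.insulin.fillna(diabetes_df.insulin.median())

# Mode imputation, the usual choice for categorical columns.
riskfactors_df.marital.fillna(riskfactors_df.marital.mode()[0])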

How to handle missing data in regression analysis?

Complete Case Analysis

One of the simplest ways to deal with missing data is to exclude any observations that have missing values in any of the variables involved in the regression. This is called complete case analysis, or listwise deletion.
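In pandas, listwise deletion is a dropna over the variables that enter the model; a sketch (this variable list is illustrative, not prescribed by the text):

# Keep only rows that are complete in every regression variable.
complete_cases_df = riskfactors_df.dropna(
    subset=["weight_lbs", "height_inch", "age"],
    how="any"
)
complete_cases_df.shape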

What is the 10 rule for missing data?

A working rule is, when 10% of the cases is missing in a multivariable model, 10 imputed datasets have to be generated, when 15% is missing, 15 imputed datasets, etc. After imputation the statistical analysis has to be repeated in each multiply imputed dataset.
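Encoded literally, the working rule looks like this (a sketch: it takes the percentage of incomplete cases and rounds up to get the number of imputed datasets):

import math

# Percentage of cases (rows) with at least one missing value.
percent_incomplete = riskfactors_df.isna().any(axis="columns").mean() * 100

# Working rule: one imputed dataset per percentage point of missing cases.
number_of_imputed_datasets = math.ceil(percent_incomplete)
number_of_imputed_datasets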

What is the 5 percent rule for missing data?

However, if more than 5% of the data is missing, deleting the missing data will result in a reduced sample size and an increased standard error of the parameter estimates. In this case, it is strongly suggested to use imputation of the mean, median or mode, or multiple imputation, to fill in the missing data.

What are the 4 types of missing data?

There are four qualitatively distinct types of missing data. Missing data is either: structurally missing, missing completely at random (MCAR), missing at random (MAR), or nonignorable (also known as missing not at random, MNAR).

What are two common techniques to handle missing data?

These approaches can be adopted to deal with missing values: mean, median, mode imputation, random sample imputation, and multiple imputations. The list is not exhaustive, and each of these methods has pros and cons.
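Mean, median, and mode imputation are sketched above; random sample imputation, also listed here, fills each gap with a random draw from the observed values of the same column. A minimal sketch with a fixed seed for reproducibility (the helper name is mine, not from any library):

def random_sample_impute(column: pd.Series, seed: int = 42) -> pd.Series:
    # Fill missing entries with random draws from the observed values.
    column = column.copy(deep=True)
    missing_mask = column.isna()
    donors = column.dropna().sample(
        n=missing_mask.sum(),
        replace=True,
        random_state=seed
    )
    column[missing_mask] = donors.to_numpy()
    return column

random_sample_impute(riskfactors_df.weight_lbs)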

What percentage of missing values should be dropped?

It is totally dependent on the problem at hand, but in general, if a column has a high percentage of missing data, say 40-50%, and you do not have much data, then dropping the column can be a good choice. Otherwise, you can impute the data with the mean or mode, or with other imputation techniques such as KNN or regression.
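The column-dropping heuristic is easy to sketch; the 50% threshold below is the upper figure quoted above, not a universal constant:

# Keep only columns whose fraction of missing values is at most 50%.
threshold = 0.50
missing_fraction = diabetes_df.isna().mean()
diabetes_df.loc[:, missing_fraction <= threshold]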

How do you resolve missing data?

When dealing with missing data, data scientists can use two primary methods: imputation or data removal. The imputation method substitutes reasonable guesses for the missing data. It is most useful when the percentage of missing data is low.

How do you reduce missing data?

How to reduce the risk for missing data in study design
  1. Easily accessible data.
  2. Only collect the information that is absolutely essential for your study.
  3. Mandatory fields.
  4. Data validation.
  5. Include an option for "not applicable".
  6. Make the CRF easy to read and understand.
  7. Run a pilot.
  8. Choose external data sources carefully.

What is the best imputation method?

The simplest imputation method is replacing missing values with the mean or median values of the dataset at large, or some similar summary statistic. This has the advantage of being the simplest possible approach, and one that doesn't introduce any undue bias into the dataset.

What is the approach to dealing with missing data?

The simplest method to deal with missing data is data reduction, which deletes the instances with missing values. However, it can lead to great information loss.

What is the best way to handle missing data for categorical data?

Deal with missing values in categorical features (a sketch of the second option follows this list):
  1. 1st Model: Delete the entire column maker.
  2. 2nd Model: Replace missing values with the most frequent values.
  3. 3rd Model: Delete rows with null values.
  4. 4th Model: Predict the missing values with the RandomForestClassifier.
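A sketch of the second option (most-frequent replacement) using scikit-learn's SimpleImputer; scikit-learn is not imported elsewhere in this notebook, so treat it as an assumed dependency:

from sklearn.impute import SimpleImputer

# Replace missing categories with the most frequent one.
imputer = SimpleImputer(strategy="most_frequent")
marital_imputed = imputer.fit_transform(riskfactors_df[["marital"]])
marital_imputed[:5]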

Which functions are used to manage missing data?

Methods for Identifying Missing Data

  .isnull()   Identifies missing values in a Series or DataFrame.
  .notnull()  Identifies non-missing values in a Series or DataFrame; returns a boolean Series or DataFrame where True indicates non-missing values and False indicates missing values.
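Both are already used throughout this notebook; for completeness, on the small test frame defined at the top:

test_missing_df.x.isnull()   # True where a value is missing
test_missing_df.x.notnull()  # True where a value is present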

What is the tolerance for missing data?

The tolerance range for missing data in a dataset depends on the proportion of missing values and the impact they may have on the study outcomes. If missing values represent less than 5% of the total values, they can be accommodated without significant impact on the results.

How much data loss is acceptable?

A company's RPO is the maximum amount of data loss it considers acceptable when a failure or outage occurs. RPO is typically measured in units of time. For example, a company with an RPO of ten minutes has decided that in the event of an outage, it can afford to lose up to ten minutes of data (lost transactions, etc.)

What if there is more than 50% missing values?

When dealing with large amounts of missing values, particularly when they exceed 50% of the dataset, several strategies can be employed in machine learning. If the missing values are not randomly distributed, imputation methods such as mean, median, or mode imputation, or more sophisticated techniques like K-Nearest Neighbors (KNN) imputation, can be used.
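A sketch of KNN imputation with scikit-learn's KNNImputer (again an assumed dependency; it accepts only numeric input, hence the select_dtypes step):

from sklearn.impute import KNNImputer

# Each missing value is filled from the mean of its k nearest
# neighbors, measured on the remaining numeric features.
numeric_df = riskfactors_df.select_dtypes("number")
imputer = KNNImputer(n_neighbors=5)
numeric_imputed_df = pd.DataFrame(
    imputer.fit_transform(numeric_df),
    columns=numeric_df.columns
)
numeric_imputed_df.head()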

What is the rule of thumb for missing values?

While there is no absolute cutoff for the number of missing data values you should attempt to fill in, a common rule of thumb is to fill in no more than 5 percent of the values in a dataset. Finally, determine where the missing values are located.
