Outliers
Outliers are elements that:
have characteristic(s) significantly different from those of other elements in a grouping
may or may not be desired as part of the group
can occur due to factors such as:
chance
measurement errors
heavily skewed group population
the mixture of different group populations
normal occurrence within a group
Outlier data points are illustrated below:
Python Example
The illustration above is generated by this code example.
To download the code below click here.
"""
outlier_detection.py
generates and displays outlier data

A cluster of normally distributed 2-D points is mixed with a smaller set
of uniformly distributed points, every point is scored with scikit-learn's
LocalOutlierFactor, and the points whose normalised score reaches a
threshold are circled on a scatter plot.
"""

# Import needed libraries.
import numpy as np
import matplotlib.pyplot as plotlib
from sklearn.neighbors import LocalOutlierFactor

# Define parameters.
# Shape of the normally distributed point cloud: (rows, columns).
data_points_first_dimension = 150
data_points_second_dimension = 2
# Range and shape of the uniformly distributed candidate outliers.
potential_outliers_low_value = -5
potential_outliers_high_value = 5
potential_outliers_first_dimension = 20
potential_outliers_second_dimension = 2
# Model settings.
outliers_proportion = 0.1
number_of_nearest_neighbors = 20
# Normalised magnitudes below this limit are not drawn as outliers.
outlier_identification_limit = 0.3
# Plot settings.
plot_maximum = 6
plot_minimum = -6
data_point_plot_size = 3
data_point_plot_color = 'k'
data_point_legend_size = 10
data_point_legend_title = 'Data Points'
outlier_plot_edge_color = 'blue'
outlier_plot_fill_color = 'none'
outlier_plot_size_multiplier = 500
outlier_legend_size = 30
outlier_legend_title = 'Outliers'
legend_location = 'best'

# Generate two dimensional random data points.
data_points = np.random.randn(
    data_points_first_dimension,
    data_points_second_dimension)

# Generate potential outliers.
potential_outliers = np.random.uniform(
    low=potential_outliers_low_value,
    high=potential_outliers_high_value,
    size=(potential_outliers_first_dimension,
          potential_outliers_second_dimension))

# Create a joint data points and outliers array
# (data points first, potential outliers appended as extra rows).
all_data = np.vstack((data_points, potential_outliers))

# Instantiate a model.
model = LocalOutlierFactor(
    n_neighbors=number_of_nearest_neighbors,
    contamination=outliers_proportion)

# Train the model. Only the fitted scores are used below, so the
# inlier/outlier labels that fit_predict() would return are not needed.
model.fit(all_data)

# Get the scores: one value per row of all_data.
scores = model.negative_outlier_factor_
print('Distance Scores:')
print(scores)

# Calculate the outlier magnitudes by min-max scaling the scores so that
# the highest score maps to 0.0 and the lowest score maps to 1.0.
highest_score = scores.max()
score_range = highest_score - scores.min()
if score_range > 0:
    outlier_magnitudes = (highest_score - scores) / score_range
else:
    # All scores are identical: nothing stands out, and dividing by a
    # zero range would only produce NaN marker sizes.
    outlier_magnitudes = np.zeros_like(scores)
print('Outlier Magnitudes')
print(outlier_magnitudes)

# Modify outlier magnitudes to exclude non-outliers: a magnitude of zero
# gives a zero-size marker in the outlier scatter plot below.
outlier_magnitudes[outlier_magnitudes < outlier_identification_limit] = 0.0

# Plot the data.
plotlib.scatter(
    all_data[:, 0],
    all_data[:, 1],
    color=data_point_plot_color,
    s=data_point_plot_size,
    label=data_point_legend_title)

# Plot outlier scores as hollow circles whose area grows with magnitude.
plotlib.scatter(
    all_data[:, 0],
    all_data[:, 1],
    s=outlier_plot_size_multiplier * outlier_magnitudes,
    edgecolors=outlier_plot_edge_color,
    facecolors=outlier_plot_fill_color,
    label=outlier_legend_title)

# Set plot parameters.
plotlib.xlim((plot_minimum, plot_maximum))
plotlib.ylim((plot_minimum, plot_maximum))
legend = plotlib.legend(loc=legend_location)

# Give the legend markers fixed sizes so they do not inherit the sizes
# used in the plot. Matplotlib 3.7 renamed Legend.legendHandles to
# Legend.legend_handles and the old name was subsequently removed, so
# look up the new name first and fall back to the old one.
legend_handles = getattr(legend, 'legend_handles', None)
if legend_handles is None:
    legend_handles = legend.legendHandles
legend_handles[0].set_sizes([data_point_legend_size])
legend_handles[1].set_sizes([outlier_legend_size])

# Display the plot.
plotlib.show()
Output is displayed below:
Distance Scores:
[-1.03093045 -0.97025576 -0.9789199 -1.22718898 -1.27367367 -0.97975501
-1.21533808 -1.00988116 -1.02548226 -1.025744 -1.35310946 -1.16465953
-0.9510596 -1.32006976 -0.99420069 -1.1913924 -1.3004024 -1.05842189
-1.08408608 -1.42826986 -0.98633271 -1.69829943 -1.16546652 -1.31090524
-1.55808156 -1.18331503 -0.98807644 -1.64977176 -1.15588476 -1.01462976
-0.98982932 -1.03790483 -1.20447146 -1.34781944 -1.2091296 -1.63144386
-0.96550363 -0.97216442 -0.95213817 -1.24991303 -1.16191968 -1.41560363
-1.00864374 -1.07066267 -1.20780501 -1.12826381 -1.44469637 -0.99347285
-0.97226684 -1.0571796 -1.03506519 -1.15569389 -0.9977228 -1.34467933
-1.27038252 -1.71304947 -1.06440292 -0.98689587 -0.96650836 -0.97866635
-1.2314442 -1.01245056 -1.16271364 -0.9897751 -1.21030371 -1.32488364
-0.99216012 -0.97737657 -0.97846065 -1.15032637 -1.06060304 -1.19164428
-1.34915758 -0.97250767 -0.99714778 -1.1537212 -1.14219448 -0.99217374
-1.16784187 -1.01894702 -0.99534788 -1.23918424 -1.53565746 -0.9779416
-0.97903216 -1.44793516 -1.06830985 -1.33200092 -1.38406044 -0.97202706
-0.98077415 -1.05243228 -0.95718849 -1.13566501 -1.10095812 -0.97621319
-1.17810311 -1.88736652 -1.01340382 -1.275288 -0.98474142 -1.08752184
-1.09544976 -1.27247752 -0.95919331 -1.11981845 -1.09859267 -0.99629938
-0.98216839 -0.9686214 -0.98396905 -1.06346488 -0.99863583 -0.98767859
-1.09441195 -1.11718569 -1.76095356 -1.00159203 -1.27144958 -1.40613554
-1.86014928 -0.98707618 -1.02529512 -1.09964265 -1.38751246 -0.99082823
-1.10003713 -0.98315692 -0.97268895 -1.00742788 -1.15342712 -1.17502979
-1.02641112 -1.33913601 -1.06224209 -1.08605004 -1.0222628 -1.03701658
-1.04040942 -2.07409729 -0.97115055 -0.97948049 -1.07748516 -1.13111777
-1.47427642 -1.11833171 -1.32761747 -1.00165439 -1.02760137 -1.22029597
-2.14287251 -2.48384305 -3.22564161 -1.32583234 -1.80993966 -2.66458652
-2.01119781 -1.62400676 -3.09748357 -1.10418782 -1.83701673 -2.29961671
-1.99827914 -1.69441379 -1.66574954 -1.43739056 -3.27846442 -2.62429321
-2.42623938 -1.62578036]
Outlier Magnitudes
[3.43175570e-02 8.24788049e-03 1.19705423e-02 1.18642607e-01
1.38615363e-01 1.23293589e-02 1.13550713e-01 2.52734512e-02
3.19766695e-02 3.20891306e-02 1.72745992e-01 9.17760105e-02
0.00000000e+00 1.58550054e-01 1.85361340e-02 1.03262136e-01
1.50099713e-01 4.61296157e-02 5.71565710e-02 2.05039646e-01
1.51555510e-02 3.21061387e-01 9.21227422e-02 1.54612398e-01
2.60814944e-01 9.97915891e-02 1.59047691e-02 3.00210839e-01
8.80058135e-02 2.73137527e-02 1.66579155e-02 3.73141907e-02
1.08881726e-01 1.70473067e-01 1.10883157e-01 2.92336018e-01
6.20606557e-03 9.06796283e-03 4.63420636e-04 1.28406295e-01
9.05987949e-02 1.99597430e-01 2.47417788e-02 5.13890272e-02
1.10314028e-01 7.61381100e-02 2.12097508e-01 1.82234069e-02
9.11196598e-03 4.55958475e-02 3.60941022e-02 8.79238033e-02
2.00494548e-02 1.69123877e-01 1.37201276e-01 3.27398939e-01
4.86994409e-02 1.53975195e-02 6.63776241e-03 1.18616005e-02
1.20470917e-01 2.63774271e-02 9.09399298e-02 1.66346193e-02
1.11387629e-01 1.60618398e-01 1.76593735e-02 1.13074284e-02
1.17732183e-02 8.56175782e-02 4.70667739e-02 1.03370361e-01
1.71048015e-01 9.21544146e-03 1.98023891e-02 8.70762115e-02
8.21236050e-02 1.76652278e-02 9.31433423e-02 2.91687198e-02
1.90290381e-02 1.23796526e-01 2.51180136e-01 1.15502026e-02
1.20187740e-02 2.13489099e-01 5.03781039e-02 1.63676432e-01
1.86044485e-01 9.00894243e-03 1.27672431e-02 4.35561001e-02
2.63335616e-03 7.93181322e-02 6.44058608e-02 1.08075664e-02
9.75522180e-02 4.02296545e-01 2.67870090e-02 1.39308981e-01
1.44718320e-02 5.86327879e-02 6.20391246e-02 1.38101421e-01
3.49475319e-03 7.25094524e-02 6.33895169e-02 1.94378644e-02
1.33662970e-02 7.54565404e-03 1.41399734e-02 4.82963991e-02
2.04417506e-02 1.57338284e-02 6.15932140e-02 7.13782518e-02
3.47981559e-01 2.17119177e-02 1.37659753e-01 1.95529344e-01
3.90602301e-01 1.54749925e-02 3.18962624e-02 6.38406554e-02
1.87527692e-01 1.70871116e-02 6.40101458e-02 1.37910309e-02
9.29333142e-03 2.42193695e-02 8.69498559e-02 9.62317278e-02
3.23757666e-02 1.66742118e-01 4.77710137e-02 5.80004113e-02
3.05933888e-02 3.69325400e-02 3.83903191e-02 4.82527868e-01
8.63233981e-03 1.22114062e-02 5.43203984e-02 7.73643522e-02
2.24806965e-01 7.18706524e-02 1.61793024e-01 2.17387127e-02
3.28871723e-02 1.15680934e-01 5.12078042e-01 6.58580507e-01
9.77303987e-01 1.61026022e-01 3.69029079e-01 7.36239309e-01
4.55502283e-01 2.89140569e-01 9.22239203e-01 6.57935457e-02
3.80663096e-01 5.79425245e-01 4.49951608e-01 3.19391871e-01
3.07075904e-01 2.08958473e-01 1.00000000e+00 7.18926760e-01
6.33830333e-01 2.89902619e-01]