00001
00002 """
00003 Created on Thu Dec 29 15:24:08 2011
00004
00005 @author: Sat Kumar Tomer
00006 @website: www.ambhas.com
00007 @email: satkumartomer@gmail.com
00008 """
00009
00010 from __future__ import division
00011 import numpy as np
00012 import statistics as st
00013 from scipy.interpolate import interp1d
00014 from scipy.stats import norm, chi2
00015 from scipy.stats import scoreatpercentile
00016
def bias_correction(oc, mc, mp):
    """
    Quantile-mapping bias correction of a modeled prediction.

    Input:
        oc: observed current
        mc: modeled current
        mp: modeled prediction

    Output:
        mp_adjusted: adjusted modeled prediction
    """
    # work with 1-D views of the three data sets
    observed = oc.flatten()
    modeled_current = mc.flatten()
    modeled_prediction = mp.flatten()

    # empirical CDF of the observations, inverted by interpolation so that
    # a probability maps back to an observed-data quantile
    # NOTE(review): `st.cpdf` does not exist in the stdlib `statistics`
    # module imported at the top of this file — this presumably relied on a
    # Python-2 implicit relative import of a package-local statistics
    # module. Verify the import before running under Python 3.
    F_oc, OC = st.cpdf(observed, n=1000)
    inverse_obs_cdf = interp1d(F_oc, OC)

    # non-exceedance probabilities of the predictions with respect to the
    # modeled-current distribution, then mapped through the observed CDF
    F1 = st.cpdf(modeled_current, modeled_prediction)
    mp_adjusted = inverse_obs_cdf(F1)

    return mp_adjusted
00045
00046
def mk_test(x, alpha = 0.05):
    """
    Perform the MK (Mann-Kendall) test to check if there is any trend
    present in the data or not.

    Input:
        x: a vector of data
        alpha: significance level

    Output:
        trend: tells the trend (increasing, decreasing or no trend)
        h: True (if trend is present) or False (if trend is absent)
        p: p value of the significance test
        z: normalized test statistic

    Examples
    --------
    >>> x = np.random.rand(100)
    >>> trend,h,p,z = mk_test(x,0.05)
    """
    x = np.asarray(x)
    n = len(x)

    # S statistic: sum of the signs of all forward pairwise differences
    # (inner loop vectorized; was a Python-2 `xrange` double loop, which
    # is a NameError on Python 3)
    s = 0
    for k in range(n - 1):
        s += np.sign(x[k + 1:] - x[k]).sum()

    # variance of S, with the correction term for tied groups
    # (tp[i] = multiplicity of the i-th distinct value)
    unique_x, tp = np.unique(x, return_counts=True)
    g = len(unique_x)
    if n == g:  # no ties
        var_s = (n*(n-1)*(2*n+5))/18
    else:
        var_s = (n*(n-1)*(2*n+5) + np.sum(tp*(tp-1)*(2*tp+5)))/18

    # standardized statistic with the usual continuity correction
    if s > 0:
        z = (s - 1)/np.sqrt(var_s)
    elif s == 0:
        z = 0
    else:  # s < 0 (plain else so z is always bound)
        z = (s + 1)/np.sqrt(var_s)

    # two-sided p value against the standard normal distribution
    p = 2*(1 - norm.cdf(abs(z)))
    h = abs(z) > norm.ppf(1 - alpha/2)

    if (z < 0) and h:
        trend = 'decreasing'
    elif (z > 0) and h:
        trend = 'increasing'
    else:
        trend = 'no trend'

    return trend, h, p, z
00107
def independant(x, y, alpha = 0.05):
    """
    Check whether the joint cdf equals the product of the marginal
    distributions (i.e. x and y are independent) using the
    chi-squared test on a 5x5 contingency table.

    Input:
        x: a vector of data
        y: a vector of data
        alpha: significance level

    Output:
        ind: True (if independant) False (if dependant)
        p: p value of the significance test

    Examples
    --------
    >>> x = np.random.rand(100)
    >>> y = np.random.rand(100)
    >>> ind,p = independant(x,y,0.05)
    """
    # observed joint counts on a 5x5 grid
    H, xedges, yedges = np.histogram2d(x, y, bins=5)

    # expected counts under independence: outer product of the marginal
    # totals divided by the grand total (was a double Python loop)
    expected_values = np.outer(H.sum(axis=1), H.sum(axis=0)) / H.sum()

    # chi-squared statistic; skip cells with zero expected count (their
    # observed count is necessarily zero too, so they contribute nothing,
    # and including them would produce 0/0 = NaN)
    valid = expected_values > 0
    err_chi2 = ((H[valid] - expected_values[valid])**2 /
                expected_values[valid]).sum()

    # degrees of freedom of an r x c contingency table
    dof = (H.shape[0]-1)*(H.shape[1]-1)

    # upper-tail p value of the chi-squared statistic.
    # BUG FIX: the original computed 2*(1-sf) = 2*cdf, so strong dependence
    # (large statistic) gave a LARGE p and reported "independent" — exactly
    # backwards. The chi-squared test p value is the survival function.
    p = chi2.sf(err_chi2, dof)

    # fail to reject the null hypothesis of independence when p >= alpha
    ind = p >= alpha

    return ind, p
00153
00154
class SpatOutlier():
    """
    Identify outliers in spatial point data using a robust
    (median / inter-quartile range) index.
    """

    def __init__(self, rain):
        """
        Input:
            rain: rain at different spatial locations and time
                time ==> is defined in the first dimension
                space ==> is defined in the second dimension

        Raises:
            ValueError: if rain has more than 2 dimensions
        """
        # work on a float COPY so that (a) the caller's array is never
        # mutated (the original reshaped and NaN-filled the input in
        # place) and (b) np.nan can always be assigned, even when the
        # caller passes an integer array
        rain = np.array(rain, dtype=float)
        if rain.ndim > 2:
            raise ValueError('The dimension of the input should be less than or equal to 2 (two)')
        elif rain.ndim == 1:
            rain = rain.reshape(1, -1)
        self.rain = rain

    def _identify_outlier(self, threshold=2.0):
        """
        Flag values whose distance from the median exceeds
        `threshold` inter-quartile ranges.

        Input:
            threshold: threshold above which the data will be termed as outlier
        """
        rain = self.rain
        # global quartiles over all stations and times.
        # np.percentile over the full array is the modern equivalent of the
        # deprecated scipy.stats.scoreatpercentile with its default
        # axis=None (flattening) behavior.
        q_25, q_50, q_75 = np.percentile(rain, [25, 50, 75])

        # robust z-like index: distance from the median scaled by the IQR
        # (broadcasting replaces the original np.tile matrices)
        index = np.abs(rain - q_50) / (q_75 - q_25)
        self.index = index

        self.outliers = index >= threshold

    def fill_with_nan(self, threshold=2.0):
        """
        Replace the outliers with NaN.

        Input:
            threshold: outlier threshold passed to _identify_outlier
                       (new parameter; default preserves old behavior)

        Output:
            rain_filled: copy of the rain data with NaN where outliers were
        """
        self._identify_outlier(threshold)

        # return a copy so repeated calls and the stored data stay clean
        rain_filled = self.rain.copy()
        rain_filled[self.outliers] = np.nan
        return rain_filled
00205
if __name__ == "__main__":
    # demo of the quantile-mapping bias correction
    oc = np.random.randn(100)
    mc = 2+np.random.randn(100)
    mp = 2+np.random.randn(1000)

    print("mean of observed current is %f"%oc.mean())
    print("mean of modeled current is %f"%mc.mean())
    print("mean of modeled prediction is %f"%mp.mean())

    mp_adjusted = bias_correction(oc, mc, mp)
    print("mean of adjusted modeled prediction is %f"%mp_adjusted.mean())

    # demo of the spatial outlier detection
    x = np.random.randn(5,20)
    x[4,4] = 2.9
    foo = SpatOutlier(x)
    x1 = foo.fill_with_nan()
    # was a Python-2 print statement (`print x1[4,4]`), which is a
    # SyntaxError under Python 3 — the rest of the file uses print()
    print(x1[4,4])
00225