STK++ 0.9.13
STK_Stat_UnivariateReal.h
Go to the documentation of this file.
1/*--------------------------------------------------------------------*/
2/* Copyright (C) 2004-2016 Serge Iovleff, Université Lille 1, Inria
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU Lesser General Public License as
6 published by the Free Software Foundation; either version 2 of the
7 License, or (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU Lesser General Public License for more details.
13
14 You should have received a copy of the GNU Lesser General Public
15 License along with this program; if not, write to the
16 Free Software Foundation, Inc.,
17 59 Temple Place,
18 Suite 330,
19 Boston, MA 02111-1307
20 USA
21
22 Contact : S..._Dot_I..._At_stkpp_Dot_org (see copyright for ...)
23*/
24
25/*
26 * Project: stkpp::STatistiK::StatDesc
27 * Purpose: Compute elementary 1D statistics for all variables.
28 * Author: Serge Iovleff, S..._Dot_I..._At_stkpp_Dot_org (see copyright for ...)
29 **/
30
36#ifndef STK_STAT_UNIVARIATEREAL_H
37#define STK_STAT_UNIVARIATEREAL_H
38
41
42namespace STK
43{
44namespace Stat
45{
46
47template < class TContainer1D, typename Type = typename TContainer1D::Type >
49
62template < class TContainer1D>
63class Univariate<TContainer1D, Real>
64{
65 public:
72 Univariate( TContainer1D const& V, bool sorted = false)
73 : nbSamples_(V.size())
74 , nbAvailable_(V.size())
75 , nbMiss_(0)
76 , V_(V)
77 , W_()
78 , weighted_(false)
79 , sorted_(sorted)
80 , comporder_(false)
81 , compstat_(false)
82 , sumweights_(0.)
83 , sum2weights_(0.)
84 , mean_(Arithmetic<Real>::NA())
85 , median_(Arithmetic<Real>::NA())
86 , var_(Arithmetic<Real>::NA())
87 , uvar_(Arithmetic<Real>::NA())
88 , std_(Arithmetic<Real>::NA())
89 , ustd_(Arithmetic<Real>::NA())
90 , mad_(Arithmetic<Real>::NA())
91 , kurtosis_(Arithmetic<Real>::NA())
92 , skewness_(Arithmetic<Real>::NA())
93 , quartiles_(3, Arithmetic<Real>::NA(), String(_T("Quartiles")))
94 , deciles_(9, Arithmetic<Real>::NA(), String(_T("Deciles")))
95 , viceciles_(19, Arithmetic<Real>::NA(), String(_T("Viceciles")))
96 , percentiles_(99, Arithmetic<Real>::NA(), String(_T("Percentiles")))
97 {
99 initializeVariable();
100 compOrderStatistics();
101 compStatistics();
102 }
103
111 Univariate( TContainer1D const& V, TContainer1D const& W, bool sorted = false)
112 : nbSamples_(V.size())
113 , nbAvailable_(V.size())
114 , nbMiss_(0)
115 , V_(V)
116 , W_(W)
117 , weighted_(true)
118 , sorted_(sorted)
119 , comporder_(false)
120 , compstat_(false)
121 , sumweights_(0.0)
122 , sum2weights_(0.0)
123 , mean_(Arithmetic<Real>::NA())
124 , median_(Arithmetic<Real>::NA())
125 , var_(Arithmetic<Real>::NA())
126 , uvar_(Arithmetic<Real>::NA())
127 , std_(Arithmetic<Real>::NA())
128 , ustd_(Arithmetic<Real>::NA())
129 , mad_(Arithmetic<Real>::NA())
130 , kurtosis_(Arithmetic<Real>::NA())
131 , skewness_(Arithmetic<Real>::NA())
132 , quartiles_(3)
133 , deciles_(9)
134 , viceciles_(19)
135 , percentiles_(99)
136 {
138 // check weights size
139 if ((V_.range() != W_.range()))
140 { STKRUNTIME_ERROR_NO_ARG(Univariate::Univariate(V, W, sorted),V and W have different range);}
141 initializeVariableAndWeights();
142 compOrderStatistics();
143 compWeightedStatistics();
144 }
149 : nbSamples_(stat.nbSamples_)
150 , nbAvailable_(stat.nbAvailable_)
151 , nbMiss_(stat.nbMiss_)
152 , V_(stat.V_)
153 , W_(stat.W_)
154 , weighted_(stat.weighted_)
155 , sorted_(stat.sorted_)
156 , comporder_(stat.comporder_)
157 , compstat_(stat.compstat_)
158 , sumweights_(stat.sumweights_)
159 , sum2weights_(stat.sum2weights_)
160 , min_(stat.min_)
161 , max_(stat.max_)
162 , amax_(stat.amax_)
163 , mean_(stat.mean_)
164 , median_(stat.median_)
165 , var_(stat.var_)
166 , uvar_(stat.uvar_)
167 , std_(stat.std_)
168 , ustd_(stat.ustd_)
169 , mad_(stat.mad_)
170 , kurtosis_(stat.kurtosis_)
171 , skewness_(stat.skewness_)
172 , quartiles_(stat.quartiles_)
173 , deciles_(stat.deciles_)
174 , viceciles_(stat.viceciles_)
175 , percentiles_(stat.percentiles_)
176 {}
177
180
183 {
184 nbSamples_ = stat.nbSamples_;
185 nbAvailable_ = stat.nbAvailable_;
186 nbMiss_ = stat.nbMiss_;
187 V_ = stat.V_;
188 W_ = stat.W_;
189 weighted_ = stat.weighted_;
190 sorted_ = stat.sorted_;
191 comporder_ = stat.comporder_;
192 compstat_ = stat.compstat_;
193 sumweights_ = stat.sumweights_;
194 sum2weights_ = stat.sum2weights_;
195 min_ = stat.min_;
196 max_ = stat.max_;
197 amax_ = stat.amax_;
198 mean_ = stat.mean_;
199 median_ = stat.median_;
200 var_ = stat.var_;
201 uvar_ = stat.uvar_;
202 std_ = stat.std_;
203 ustd_ = stat.ustd_;
204 mad_ = stat.mad_;
205 kurtosis_ = stat.kurtosis_;
206 skewness_ = stat.skewness_;
207 quartiles_ = stat.quartiles_;
208 deciles_ = stat.deciles_;
209 viceciles_ = stat.viceciles_;
210 percentiles_ = stat.percentiles_;
211
212 return *this;
213 }
214
219 template<class Vector>
220 void setData( Vector const& V, bool sorted = false)
221 {
223 // set variable and weights
224 V_ = V;
225 W_.clear(); // no weights
226 // initialize dimensions
227 nbSamples_ = V_.size();
228 nbAvailable_ = V_.size();
229 nbMiss_ = 0;
230 // initialize data
231 weighted_ = false;
232 sorted_ = sorted;
233 comporder_ = false;
234 compstat_ = false;
235 sumweights_ = 0.0;
236 sum2weights_ = 0.0;
237 mean_ = Arithmetic<Real>::NA();
238 median_ = Arithmetic<Real>::NA();
239 var_ = Arithmetic<Real>::NA();
240 uvar_ = Arithmetic<Real>::NA();
241 std_ = Arithmetic<Real>::NA();
242 ustd_ = Arithmetic<Real>::NA();
243 mad_ = Arithmetic<Real>::NA();
244 kurtosis_ = Arithmetic<Real>::NA();
245 skewness_ = Arithmetic<Real>::NA();
246 quartiles_ = Arithmetic<Real>::NA();
247 deciles_ = Arithmetic<Real>::NA();
248 viceciles_ = Arithmetic<Real>::NA();
249 percentiles_ = Arithmetic<Real>::NA();
250
251 initializeVariable();
252 compOrderStatistics();
253 compStatistics();
254 }
255
260 template<class Vector, class Weights>
261 void setData( Vector const& V, Weights const& W, bool sorted = false)
262 {
265 // check range size
266 if ((V.range() != W.range()))
267 { STKRUNTIME_ERROR_NO_ARG(Univariate::setData(V, W, sorted),V and W have different range);}
268 // set variable and weights
269 V_ = V;
270 W_ = W;
271 // initialize dimensions
272 nbSamples_ = V_.size();
273 nbAvailable_ = V_.size();
274 nbMiss_ = 0;
275 // initialize data
276 weighted_ = true;
277 sorted_ = sorted;
278 comporder_ = false;
279 compstat_ = false;
280 sumweights_ = 0.0;
281 sum2weights_ = 0.0;
282 mean_ = Arithmetic<Real>::NA();
283 median_ = Arithmetic<Real>::NA();
284 var_ = Arithmetic<Real>::NA();
285 uvar_ = Arithmetic<Real>::NA();
286 std_ = Arithmetic<Real>::NA();
287 ustd_ = Arithmetic<Real>::NA();
288 mad_ = Arithmetic<Real>::NA();
289 kurtosis_ = Arithmetic<Real>::NA();
290 skewness_ = Arithmetic<Real>::NA();
291 quartiles_ = Arithmetic<Real>::NA();
292 deciles_ = Arithmetic<Real>::NA();
293 viceciles_ = Arithmetic<Real>::NA();
294 percentiles_ = Arithmetic<Real>::NA();
295
296 initializeVariableAndWeights();
297 compOrderStatistics();
298 compWeightedStatistics();
299 }
300
302 inline const int nbSamples() const {return nbSamples_;}
304 inline const int nbAvailableSamples() const {return nbAvailable_;}
306 inline const int nbMissingSamples() const {return nbMiss_;}
307
309 inline const Real min() const {return min_;}
311 inline const Real max() const {return max_;}
313 inline const Real aMax() const {return amax_;}
314
316 inline const Real mean() const {return mean_;}
318 inline const Real median() const {return median_;}
319
321 inline const Real variance() const {return var_;}
323 inline const Real unbiasedVariance() const {return uvar_;}
325 inline const Real std() const {return std_;}
327 inline const Real unbiasedStd() const {return ustd_;}
329 inline const Real mad() const { return( mad_);}
330
332 inline const Real kurtosis() const {return kurtosis_;}
334 inline const Real skewness() const {return skewness_;}
335
337 inline Variable<Real> const& quartiles() const {return quartiles_;}
339 inline Variable<Real> const& deciles() const {return deciles_;}
341 inline Variable<Real> const& viceciles() const { return viceciles_;}
343 inline Variable<Real> const& percentiles() const { return percentiles_;}
344
351 template<class OtherArray>
353 {
354 if (!sorted_)
356 // number of quantiles
357 int nt = T.size(), shift = V_.begin()-1;
358 Real n1 = Real(nbAvailable_+1), nt1 = Real(nt+1);
359
360 for (int j=T.begin(), k=1; j<T.end(); j++, k++)
361 {
362 // find index of the k-th quantile
363 Real find = Real(k*n1)/nt1; // compute the index in Real
364 int tind = std::max(1, int(find)); // in int
365 int tind1 = std::min(nbAvailable_, tind+1); // next
366 Real aux = find - Real(tind); // lower ponderation
367
368 // nbAvailable_+1 not perfectly divisible ? weighting...
369 aux ? T[j] = aux * V_[shift+tind] + (1.0-aux)*V_[shift+tind1]
370 : T[j] = V_[shift+tind];
371 }
372 }
373
374 private:
379 {
380 // initialize the extreme values
381 min_ = Arithmetic<Real>::max();
382 max_ = -Arithmetic<Real>::max();
383 // discard missing values
384 for (int i=V_.lastIdx(); i>=V_.begin(); i--)
385 {
386 if ( isNA(V_[i]) )
387 {
388 // if NA
389 nbAvailable_--; // decrease nbAvailable_
390 nbMiss_++; // increase nbMiss_
391 V_.erase(i); // Delete the current row
392 }
393 else
394 {
395 min_ = std::min((Real)V_[i], min_); // update min_
396 max_ = std::max((Real)V_[i], max_); // update max_
397 }
398 }
399 // all weights are 1/n.
400 sumweights_ = 1.0;
401 sum2weights_= 1./(Real)nbAvailable_;
402
403 // no samples
404 if (nbAvailable_ == 0)
405 {
406 min_ = Arithmetic<Real>::NA();
407 max_ = Arithmetic<Real>::NA();
408 amax_ = Arithmetic<Real>::NA();
409 }
410 else // one or more samples : compute amax
411 amax_ = std::max(std::abs(min_), std::abs(max_));
412 }
413
418 {
419 // initialize the extreme values
420 min_ = Arithmetic<Real>::max();
421 max_ = -Arithmetic<Real>::max();
422
423 // discard missing values or values without weights
424 for (int i=V_.lastIdx(); i>=V_.begin(); i--)
425 {
426 if ( isNA(V_[i])||isNA(W_[i]))
427 {
428 V_.erase(i); // Delete the current row
429 W_.erase(i); // for the weights too
430 nbAvailable_--; // decrease number of samples
431 nbMiss_++; // increase number of missing values
432 }
433 else
434 {
435 min_ = std::min(V_[i], min_); // update min_
436 max_ = std::max(V_[i], max_); // update max_
437 sumweights_ += (W_[i] = std::abs((Real)W_[i]));// sum absolute weights
438 sum2weights_ += W_[i] * W_[i]; // sum squared weights
439 }
440 }
441 // no samples
442 if (nbAvailable_ == 0)
443 {
444 min_ = Arithmetic<Real>::NA();
445 max_ = Arithmetic<Real>::NA();
446 amax_ = Arithmetic<Real>::NA();
447 }
448 else // one or more samples : compute amax
449 amax_ = std::max(std::abs(min_), std::abs(max_));
450 }
451
461 {
462 // If there is no samples
463 if (nbAvailable_ == 0)
464 {
465 mean_ = Arithmetic<Real>::NA();
466 var_ = Arithmetic<Real>::NA();
467 uvar_ = Arithmetic<Real>::NA();
468 std_ = Arithmetic<Real>::NA();
469 ustd_ = Arithmetic<Real>::NA();
470 mad_ = Arithmetic<Real>::NA();
471 kurtosis_ = Arithmetic<Real>::NA();
472 skewness_ = Arithmetic<Real>::NA();
473 return;
474 }
475
476 // One observation
477 if (nbAvailable_ == 1)
478 {
479 mean_ = V_.front();
480 var_ = Arithmetic<Real>::NA();
481 std_ = 0.0;
482 ustd_ = Arithmetic<Real>::NA();
483 mad_ = 0.0;
484 kurtosis_ = 0.0;
485 skewness_ = 0.0;
486 return;
487 }
488 // get indexes
489 // get nbAvailable observation in Real
490 const Real n = (Real)nbAvailable_;
491 // first pass : compute the mean
492 // scale the samples for preventing overflows
493 mean_ = 0.;
494 {
495 if (amax_) // if the absolute maximal value is greater than 0
496 {
497 // sum samples with scaling
498 for (int i=V_.begin(); i<=V_.lastIdx(); i++)
499 mean_ += V_[i]/amax_;
500 // divide by the number of available observation
501 mean_ /= n;
502 // unscale
503 mean_ *= amax_;
504 }
505 }
506 // second pass : compute the variance, skewness and kurtosis
507 // initialize values
508 Real sum = 0.0, dev1, dev2;
509 var_ = 0.0;
510 mad_ = 0.0;
511 kurtosis_ = 0.0;
512 skewness_ = 0.0;
513 for (int i=V_.begin(); i<V_.end(); i++)
514 {
515 sum += (dev1 = V_[i]-mean_); // deviation from the mean
516 var_ += (dev2 = dev1*dev1); // squared deviation
517 mad_ += std::abs( dev1 ); // absolute deviation
518 skewness_ += dev2 * dev1; // cubic deviation
519 kurtosis_ += dev2 * dev2; // squared squared deviation
520 }
521 mad_ *= sum2weights_;
522 uvar_ = (var_ - sum*sum/n)/(n - 1.);
523 var_ = (var_ - sum*sum/n)/n;
524 std_ = sqrt((double)var_);
525 ustd_ = sqrt((double)uvar_);
526 // if there is variance
527 if (var_)
528 {
529 skewness_ /= n;
530 kurtosis_ /= n;
531
532 skewness_ /= (var_ * std_);
533 kurtosis_ /= (var_ * var_);
534 kurtosis_ -= 3.0;
535 }
536 else
537 {
538 skewness_ = 0.0;
539 kurtosis_ = 0.0;
540 }
541 // Statistics are computed
542 compstat_ = true;
543 }
544
547 {
548 // If there is no samples
549 if (nbAvailable_ == 0)
550 {
551 mean_ = Arithmetic<Real>::NA();
552 var_ = Arithmetic<Real>::NA();
553 uvar_ = Arithmetic<Real>::NA();
554 std_ = Arithmetic<Real>::NA();
555 ustd_ = Arithmetic<Real>::NA();
556 mad_ = Arithmetic<Real>::NA();
557 kurtosis_ = Arithmetic<Real>::NA();
558 skewness_ = Arithmetic<Real>::NA();
559 return;
560 }
561
562 // One observation or pathological weights
563 if ((nbAvailable_ == 1)||(sumweights_*sumweights_ <= sum2weights_))
564 {
565 mean_ = V_.front();
566 var_ = 0.;
567 uvar_ = Arithmetic<Real>::NA();
568 std_ = 0.;
569 ustd_ = Arithmetic<Real>::NA();
570 mad_ = 0.;
571 kurtosis_ = 0.;
572 skewness_ = 0.;
573 return;
574 }
575 // first pass : get the mean
576 // scale the samples for preventing overflows
577 mean_ = 0.;
578 if (amax_) // if the maximal value is greater than 0
579 {
580 // sum samples
581 for (int i=V_.begin(); i<V_.end(); i++)
582 mean_ += (W_[i]*V_[i])/amax_; // compute the mean with scaling
583 mean_ /= sumweights_; // weight the sum
584 mean_ *= amax_; // and unscale
585 }
586 // second pass : compute the variance, skewness and kurtosis
587 // initialize some values
588 Real sum = 0.0, dev1, dev2;
589 var_ = 0.0;
590 mad_ = 0.0;
591 kurtosis_ = 0.0;
592 skewness_ = 0.0;
593 for (int i=V_.begin(); i<V_.end(); i++)
594 {
595 Real weight = W_[i];
596 sum += weight * (dev1 = V_[i]-mean_); // deviation from the mean
597 var_ += weight * (dev2 = dev1*dev1); // squared value
598 mad_ += weight * std::abs(dev1); // absolute deviation from mean
599 skewness_ += weight * dev2 * dev1;
600 kurtosis_ += weight * dev2 * dev2;
601 }
602 mad_ /= sumweights_;
603 uvar_ = (var_ - sum*sum/sumweights_)/(sumweights_ - sum2weights_/sumweights_);
604 var_ = (var_ - sum*sum/sumweights_)/sumweights_;
605 std_ = sqrt((double)var_);
606 ustd_ = sqrt((double)uvar_);
607 // if there is variance
608 if (var_)
609 {
610 skewness_ /= (sumweights_ * var_ * std_);
611 kurtosis_ /= (sumweights_ * var_ * var_);
612 kurtosis_ -= 3.0;
613 }
614 else
615 {
616 skewness_ = 0.0;
617 kurtosis_ = 0.0;
618 }
619 // Stat are computed
620 compstat_ = true;
621 }
622
625 {
626 // no samples
627 if (nbAvailable_ == 0)
628 {
629 median_ = Arithmetic<Real>::NA();
630 quartiles_ = Arithmetic<Real>::NA();
631 deciles_ = Arithmetic<Real>::NA();
632 viceciles_ = Arithmetic<Real>::NA();
633 percentiles_ = Arithmetic<Real>::NA();
634 return;
635 }
636
637 // One observation
638 if (nbAvailable_ == 1)
639 {
640 median_ = max_;
641 quartiles_ = max_;
642 deciles_ = max_;
643 viceciles_ = max_;
644 percentiles_ = max_;
645 return;
646 }
647 // sort values
648 if (!sorted_)
649 {
650 // if the Variable is not weighted, we can sort it directly
651 if (!weighted_) heapSort(V_);
652 else // otherwise we have to sort V_and W_ using indirection
653 {
654 TContainer1D const& Vconst = V_;
655 Array2DVector<int> I; // auxiliary Array for indirection
656 heapSort(I, Vconst);
657 applySort1D(V_, I);
658 applySort1D(W_, I);
659 }
660 sorted_ = true;
661 }
662 // Find the Quantiles of the distribution
663 compQuantiles(quartiles_);
664 compQuantiles(deciles_);
665 compQuantiles(viceciles_);
666 compQuantiles(percentiles_);
667 // get the median
668 median_ = quartiles_[2];
669 }
670
671 protected:
672 // dimensions
676
677 // containers
678 TContainer1D V_;
679 TContainer1D W_;
680
681 // Some flag about the internal state of the object
683 bool sorted_;
686
687 // statistics
693
703
708};
709
710} // namespace Stat
711
712} // namespace STK
713
714#endif /*STK_STAT_UNIVARIATEREAL_H*/
In this file we define an implementation of the heapsort algorithm for IVector containers.
#define STKRUNTIME_ERROR_NO_ARG(Where, Error)
Definition STK_Macros.h:138
#define STK_STATIC_ASSERT_ONE_DIMENSION_ONLY(EXPR)
#define _T(x)
Let x unmodified.
Implement the interface class IVariable as Variable.
void clear()
clear the object.
The MultidimRegression class allows to regress a multidimensional output variable among a multivariat...
const Real skewness() const
get the skewness of the variable
const Real unbiasedStd() const
get the sample standard deviation of the variable
void compOrderStatistics()
Compute the Order Statistics of the variable.
Variable< Real > const & quartiles() const
get the quartiles of the variable (25%)
const Real mean() const
get the mean of the variable
void initializeVariableAndWeights()
Initialize the containers in order to discard the missing elements in the weighted case.
Real var_
Variance of the variable (division by n)
const int nbAvailableSamples() const
get the number of available samples (not missing)
Real uvar_
Unbiased Variance of the variable (division by n-1)
Real mad_
absolute deviation of the variable
Variable< Real > const & viceciles() const
get the viceciles of the variable (5%)
const Real unbiasedVariance() const
get the unbiased Variance of the variable (division by n-1)
void compStatistics()
Compute the usual statistics of the variable:
void setData(Vector const &V, Weights const &W, bool sorted=false)
Compute the statistics of a new weighted Variable.
const Real variance() const
get the variance of the variable (division by n)
Real std_
Standard deviation of the variable (n)
Real ustd_
Sample standard deviation of the variable (n-1)
Univariate(TContainer1D const &V, bool sorted=false)
Default constructor Copy locally the variable V and set dimensions.
const Real aMax() const
get the absolute maximal value
const int nbMissingSamples() const
get the number of missing samples
const Real median() const
get the median of the variable
Variable< Real > const & deciles() const
get the deciles of the varibales (10%)
const Real min() const
get the min of the variable
void compQuantiles(OtherArray &T)
Compute the quantiles of the sorted variable V_ and store the result in the array T.
TContainer1D W_
local copy of the weights
void setData(Vector const &V, bool sorted=false)
Compute the statistics of a new Variable.
Variable< Real > const & percentiles() const
get the percentiles of the variable (1%)
bool comporder_
Orders Statistics are computed ?
const int nbSamples() const
get the number of samples
const Real mad() const
get the median absolute deviation of the variable
Univariate(const Univariate &stat)
Copy constructor.
const Real kurtosis() const
get the kurtosis of the variable
Real sum2weights_
Sum of the square of the weights.
Univariate & operator=(const Univariate &stat)
Operator = : overwrite the Univariate with stat.
bool compstat_
Usuals Statistics are computed ?
const Real std() const
get the standard deviation of the variable
void initializeVariable()
Initialize the container in order to discard the missing elements in the non weighted case.
Univariate(TContainer1D const &V, TContainer1D const &W, bool sorted=false)
Default constructor for weighted variables.
void compWeightedStatistics()
Compute the usual weighted statistics of the variable.
TContainer1D V_
local copy of the variable
const Real max() const
get the max of the variable
bool isNA(Type const &x)
utility method allowing to know if a value is a NA (Not Available) value
void heapSort(Vector &T)
Sort the container T in ascending order.
void applySort1D(Vector &T, VectorInt const &I)
Apply a sorting index array to the 1D container T.
std::basic_string< Char > String
STK fundamental type of a String.
double Real
STK fundamental type of Real values.
hidden::FunctorTraits< Derived, SumOp >::Row sum(Derived const &A)
Compute the sum of A.
The namespace STK is the main domain space of the Statistical ToolKit project.
Arithmetic properties of STK fundamental types.
static Type NA()
Adding a Non Available (NA) special number.