


# Signature: StratifiedShuffleSplit(n_splits=10, *, test_size=None, train_size=None, random_state=None)

class StratifiedShuffleSplit(BaseShuffleSplit):
    """Stratified ShuffleSplit cross-validator.

    Provides train/test indices to split data in train/test sets.

    This cross-validation object is a merge of StratifiedKFold and
    ShuffleSplit, which returns stratified randomized folds. The folds
    are made by preserving the percentage of samples for each class.

    Note: like the ShuffleSplit strategy, stratified random splits
    do not guarantee that all folds will be different, although this is
    still very likely for sizeable datasets.

    Read more in the :ref:`User Guide <cross_validation>`.

    Parameters
    ----------
    n_splits : int, default=10
        Number of re-shuffling & splitting iterations.

    test_size : float or int, default=None
        If float, should be between 0.0 and 1.0 and represent the proportion
        of the dataset to include in the test split. If int, represents the
        absolute number of test samples. If None, the value is set to the
        complement of the train size. If ``train_size`` is also None, it will
        be set to 0.1.

    train_size : float or int, default=None
        If float, should be between 0.0 and 1.0 and represent the
        proportion of the dataset to include in the train split. If
        int, represents the absolute number of train samples. If None,
        the value is automatically set to the complement of the test size.

    random_state : int or RandomState instance, default=None
        Controls the randomness of the training and testing indices produced.
        Pass an int for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.model_selection import StratifiedShuffleSplit
    >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4], [1, 2], [3, 4]])
    >>> y = np.array([0, 0, 0, 1, 1, 1])
    >>> sss = StratifiedShuffleSplit(n_splits=5, test_size=0.5, random_state=0)
    >>> sss.get_n_splits(X, y)
    5
    >>> print(sss)
    StratifiedShuffleSplit(n_splits=5, random_state=0, ...)
    >>> for train_index, test_index in sss.split(X, y):
    ...     print("TRAIN:", train_index, "TEST:", test_index)
    ...     X_train, X_test = X[train_index], X[test_index]
    ...     y_train, y_test = y[train_index], y[test_index]
    TRAIN: [5 2 3] TEST: [4 1 0]
    TRAIN: [5 1 4] TEST: [0 2 3]
    TRAIN: [5 0 2] TEST: [4 3 1]
    TRAIN: [4 1 0] TEST: [2 3 5]
    TRAIN: [0 5 1] TEST: [3 4 2]
    """

    def __init__(self, n_splits=10, *, test_size=None, train_size=None,
                 random_state=None):
        super().__init__(n_splits=n_splits, test_size=test_size,
                         train_size=train_size, random_state=random_state)
        # Fallback test proportion used when both test_size and train_size
        # are None (see the ``test_size`` parameter description).
        self._default_test_size = 0.1

    def _iter_indices(self, X, y, groups=None):
        """Yield ``n_splits`` stratified (train, test) index arrays.

        Parameters
        ----------
        X : array-like
            Only its number of samples is used.

        y : array-like
            Class labels used for stratification. A 2-D ``y`` is reduced to
            one label per row by joining the row's values into a string.

        groups : object
            Unused here.

        Yields
        ------
        train : ndarray
            Shuffled training indices for one split.

        test : ndarray
            Shuffled testing indices for one split.

        Raises
        ------
        ValueError
            If any class has fewer than 2 members, or if the resolved train
            or test size is smaller than the number of classes.
        """
        n_samples = _num_samples(X)
        y = check_array(y, ensure_2d=False, dtype=None)
        n_train, n_test = _validate_shuffle_split(
            n_samples, self.test_size, self.train_size,
            default_test_size=self._default_test_size)

        if y.ndim == 2:
            # for multi-label y, map each distinct row to a string repr
            # using join because str(row) uses an ellipsis if len(row) > 1000
            y = np.array([' '.join(row.astype('str')) for row in y])

        classes, y_indices = np.unique(y, return_inverse=True)
        n_classes = classes.shape[0]

        class_counts = np.bincount(y_indices)
        if np.min(class_counts) < 2:
            raise ValueError("The least populated class in y has only 1"
                             " member, which is too few. The minimum"
                             " number of groups for any class cannot"
                             " be less than 2.")

        if n_train < n_classes:
            raise ValueError('The train_size = %d should be greater or '
                             'equal to the number of classes = %d' %
                             (n_train, n_classes))
        if n_test < n_classes:
            raise ValueError('The test_size = %d should be greater or '
                             'equal to the number of classes = %d' %
                             (n_test, n_classes))

        # Find the sorted list of instances for each class:
        # (np.unique above performs a sort, so code is O(n logn) already)
        # The stable sort groups sample indices by class; splitting at the
        # cumulative class counts yields one index array per class.
        class_indices = np.split(np.argsort(y_indices, kind='mergesort'),
                                 np.cumsum(class_counts)[:-1])

        rng = check_random_state(self.random_state)

        for _ in range(self.n_splits):
            # if there are ties in the class-counts, we want
            # to make sure to break them anew in each iteration
            n_i = _approximate_mode(class_counts, n_train, rng)
            class_counts_remaining = class_counts - n_i
            t_i = _approximate_mode(class_counts_remaining, n_test, rng)

            train = []
            test = []

            for i in range(n_classes):
                # Shuffle the members of class i, then take the first n_i[i]
                # for train and the following t_i[i] for test. The permuted
                # positions are always in range, so 'clip' never alters them.
                permutation = rng.permutation(class_counts[i])
                perm_indices_class_i = class_indices[i].take(permutation,
                                                             mode='clip')

                train.extend(perm_indices_class_i[:n_i[i]])
                test.extend(perm_indices_class_i[n_i[i]:n_i[i] + t_i[i]])

            # Shuffle again so samples are not grouped by class in the output.
            train = rng.permutation(train)
            test = rng.permutation(test)

            yield train, test

    def split(self, X, y, groups=None):
        """Generate indices to split data into training and test set.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.

            Note that providing ``y`` is sufficient to generate the splits and
            hence ``np.zeros(n_samples)`` may be used as a placeholder for
            ``X`` instead of actual training data.

        y : array-like of shape (n_samples,) or (n_samples, n_labels)
            The target variable for supervised learning problems.
            Stratification is done based on the y labels.

        groups : object
            Always ignored, exists for compatibility.

        Yields
        ------
        train : ndarray
            The training set indices for that split.

        test : ndarray
            The testing set indices for that split.

        Notes
        -----
        Randomized CV splitters may return different results for each call of
        split. You can make the results identical by setting `random_state`
        to an integer.
        """
        return super().split(X, y, groups)
