Source code for src.conditionedSampling

#
# Novel constrained sequential Latin Hypercube (with multidimensional uniformity) method
# Jan 2024
# author: Christina Schenk

#Python Packages:
import numpy as np
import pandas as pd
from scipy.stats import qmc
import lhsmdu
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sympy.utilities.iterables import multiset_permutations
from pathlib import Path  
#from itertools import permutations
import time
import random
import math
import itertools
from itertools import permutations
from scipy.spatial import minkowski_distance, distance_matrix
import csv
from sklearn.preprocessing import StandardScaler
import seaborn as sns

#---------------------------------------------------------------------------------------------------#
#### Latin Hypercube sampling on domain with variables summing up to 1: Sequentially and permutations such that condition holds (similar to Petelet et al (2010) but there for inequality constraints): 



[docs]
def scale(sample,
          bounds):
    """ 
    scales the sample to be within the bounds
    
    Parameters
    ----------
    sample: vector of sample
    bounds: list of upper and lower bounds
    
    Returns
    -------
    scaled sample
    """
    return qmc.scale(sample, bounds[:][0], bounds[:][1])




[docs]
def clip_and_scale(sample,
                   bounds):
    """
    clips and scales the sample to be within the bounds
    
    Parameters
    ----------
    sample: vector of sample
    bounds: list of upper and lower bounds
    
    Returns
    -------
    Clipped and scaled sample
    """
    return np.clip(sample, bounds[:][0], bounds[:][1])




[docs]
def generate_lhs_sample(dimension,
                        bounds,
                        rs,
                        method="LHS",
                        n_samp=10):
    """
    generates LHS or LHSMDU sample
        
    Parameters
    ----------
    dimension: integer of dimension of input space
    bounds: list of upper and lower bounds
    rs: random seed, default 1234
    method: string of method: LHS or LHSMDU
    n_samp: number of samples to be generated, default is 10
    
    Returns
    -------
    scaled LHS or LHSMDU samples
    
    """
    if method == "LHS":
        sampler_new = qmc.LatinHypercube(d=dimension, seed=rs)
        sample = sampler_new.random(n=n_samp)
    elif method == "LHSMDU":
        sample = lhsmdu.sample(dimension, n_samp, randomSeed=rs).transpose()
    else:
        raise ValueError("Invalid method. Please specify 'LHS' or 'LHSMDU'.")

    return scale(sample, bounds)




[docs]
def handle_dim_1(bounds,
                 method,
                 n_samp,
                 rs,
                 verbose=False):
    """
    handles sampling for dimension=1 
        
    Parameters
    ----------
    bounds: list of upper and lower bounds
    method: string of method: LHS or LHSMDU
    n_samp: number of samples to be generated
    rs: random seed, default 1234
    verbose: show prints and more info or not, default=False
    
    Returns
    -------
    feasible samples
    """
    x1_lb, x1_ub = bounds[0]
    sample1 = generate_lhs_sample(1, np.array([x1_lb, x1_ub]), rs,  method, n_samp)
    return sample1




[docs]
def handle_dim_2(bounds,
                 method,
                 n_samp,
                 rs,
                 verbose=False):
    """
    handles sampling for dimension=2 
        
    Parameters
    ----------
    bounds: list of upper and lower bounds
    method: string of method: LHS or LHSMDU
    n_samp: number of samples to be generated
    rs: random seed, default 1234
    verbose: show prints and more info or not, default=False
    
    Returns
    -------
    feasible samples
    """
    x1_lb, x1_ub = bounds[0]
    x2_lb, x2_ub = bounds[1]

    sample1 = generate_lhs_sample(1, np.array([x1_lb, x1_ub]), rs, method, n_samp)
    sample2 = clip_and_scale(1.0 - sample1, np.array([x2_lb, x2_ub]))

    return sample1, sample2




[docs]
def handle_dim_greater_than_2(bounds,
                              method,
                              n_samp,
                              max_iter,
                              max_iter_dim2,
                              max_iter_dim3,
                              max_rej,
                              rs,
                              verbose=False):
    """
    handles sampling for dimension>2 
        
    Parameters
    ----------
    bounds: list of upper and lower bounds
    method: string of method: LHS or LHSMDU
    n_samp: number of samples to be generated
    max_iter: maximum number of total iterations for dimension 1,2,3 to be feasible, if does not find minimum number of samples after this, breaks
    max_iter_dim2: maximum number of iterations for dimension 1,2 to be feasible
    max_iter_dim3: maximum number of iterations for dimension 1,2,3 to be feasible (for dimension=4)
    max_rej: integer number of maximum allowed samples to be rejected
    rs: random seed, default 1234
    verbose: show prints and more info or not, default=False
    
    Returns
    -------
    sample1, sample2, sample3: vectors
    """
    dim = len(bounds)
    start = time.time()
    infeasible = True
    l = 0
    l1 = 0
    x1_lb = bounds[0][0]
    x1_ub = bounds[0][1]
    x2_lb = bounds[1][0]
    x2_ub = bounds[1][1]

    while infeasible:
        if l > max_iter:
            print("No feasible sample found after " + str(l) + " iterations. Please increase max_rej, max_iter or check your bounds to find a feasible sample.")
        Sumsamp = np.zeros((n_samp,n_samp)) 
        C_1 = np.zeros((n_samp, n_samp))
        a_ind = []
        A = []
        B = []
        sample1 = generate_lhs_sample(1, np.array([x1_lb, x1_ub]), rs, method, n_samp)
        sample2 = generate_lhs_sample(1, np.array([x2_lb, x2_ub]), rs, method, n_samp)
        for i in range(len(sample1)):
            for k in range(len(sample2)):
                Sumsamp[i, k] = sum(sample1[i], sample2[k])
                if 0 < Sumsamp[i, k] <= 1.:
                    C_1[i, k] = 1
                    a_ind.append(tuple((i, k)))
                    if i not in A and k not in B:
                        A.append(i)
                        B.append(k)

        if len(A) == n_samp and len(B) == n_samp:
            sample1 = sample1[A]
            sample2 = sample2[B]
            sum_vec = np.add(sample1, sample2)
            infeasible = False
        elif len(A) != n_samp and len(B) != n_samp:
            if l1 > max_iter_dim2:
                if len(A) >= n_samp - max_rej and len(B) >= n_samp - max_rej:
                    sample1 = sample1[A]
                    sample2 = sample2[B]
                    sum_vec = np.add(sample1, sample2)
                    if verbose:
                        print(n_samp - len(A), "samples were rejected")
                    infeasible = False
                    n_samp = len(A)
            else:
                notA = []
                notB = []
                for ind in range(n_samp):
                    if ind not in A:
                        notA.append(ind)
                    if ind not in B:
                        notB.append(ind)
                count1=0
                counter_pair1=0
                for m in notA:
                    if counter_pair1>n_samp//4:
                        if verbose:
                            print("counter_pair exceeded")
                        break
                    if m not in list(zip(*a_ind))[0]:
                        count1+=1
                        #print("maybe need new sample or to reject")
                    if max_rej>0:
                        if count1>max_rej-1:
                            if verbose:
                                print("get new sample")
                            break

                    el0list = [ik for ik, (v,*_) in enumerate(a_ind) if v==m]
                    if len(el0list)>0:
                        #choose random tuple from that list:
                        chosentuple = np.random.choice(range(len(el0list)))

                        # if second index from random tuple not in D:
                        if a_ind[chosentuple][1] in B:
                            ind_rand2 = [i for i, x in enumerate(B) if x == a_ind[chosentuple][1]]
                            if len(ind_rand2)>0:
                                A.remove(A[ind_rand2[0]])
                                B.remove(a_ind[chosentuple][1])
                                notA.append(a_ind[chosentuple][0])
                                notB.append(a_ind[chosentuple][1])
                                A.append(m)
                                B.append(a_ind[chosentuple][1])
                                notA.remove(m)
                                notB.remove(a_ind[chosentuple][1])

                                counter_pair1+=1

                        else:
                            A.append(m)
                            B.append(a_ind[chosentuple][1])
                            notA.remove(m)
                            notB.remove(a_ind[chosentuple][1])

                        if len(A) == n_samp and len(B) == n_samp:
                            sample1 = sample1[A]
                            sample2 = sample2[B]
                            sum_vec = np.add(sample1, sample2)
                            infeasible = False
        
        l1 += 1

    del C_1
    del Sumsamp
    del a_ind
        
    if infeasible == False and dim==3:
        sample3 = np.zeros((len(sample1), 1))
        for i in range(len(sample1)):
            sample3[i] = 1.0-(sample1[i]+sample2[i])
            if np.any(sample3>bounds[2][1]) or np.any(sample3<bounds[2][0]):
                rej_list = [] 
                keep_list = []
                for i in range(len(sample3)):
                    if sample3[i]>bounds[2][1] or sample3[i]<bounds[2][0]:
                        if verbose:
                            print("reject because exceeding bounds")
                            print("index exceeding bounds", i)
                        rej_list.append(i)
                    else:  
                        keep_list.append(i)
                if len(rej_list)>n_samp-max_rej:
                    if verbose:
                        print("Warning:", len(rej_list), "samples were rejected because they were not within the bounds.") 
                sample1 = sample1[keep_list]
                sample2 = sample2[keep_list]
                sample3 = sample3[keep_list]
        del A
        del B
        if verbose:
            print("sample1", sample1)
            print("sample2", sample2)
            print("sample3", sample3)
            print("Sum all", sample1+sample2+sample3)

        end = time.time()
        if verbose:
            print("The conditioned " + str(method) + " algorithm took ", end-start, "CPUs")
        return sample1, sample2, sample3 
    
    if dim > 3:
        infeasible1, sample1, sample2, sample3 = handle_dim_greater_than_3(bounds, method, n_samp, max_iter, max_iter_dim3, max_rej, sum_vec, sample1, sample2, rs)
        l += 1
        if infeasible == False and infeasible1 == False:
            sample4 = np.zeros((len(sample1), 1))

            for i in range(len(sample4)):
                sample4[i] = 1.0-(sample1[i]+sample2[i]+sample3[i])

            if np.any(sample4>bounds[3][1]) or np.any(sample4<bounds[3][0]):
                rej_list = [] 
                keep_list = []
                for i in range(len(sample4)):
                    if sample4[i]>bounds[3][1] or sample4[i]<bounds[3][0]:
                        if verbose:
                            print("reject because exceeding bounds")
                            print("index exceeding bounds", i)
                        rej_list.append(i)
                    else:  
                        keep_list.append(i)
                if len(rej_list)>n_samp-max_rej:
                    if verbose:
                        print("Warning:", len(rej_list), "samples were rejected because they were not within the bounds.")
                sample1 = sample1[keep_list]
                sample2 = sample2[keep_list]
                sample3 = sample3[keep_list]
                sample4 = sample4[keep_list]
 
            if verbose:
                print("sample1", sample1)
                print("sample2", sample2)
                print("sample3", sample3)
                print("sample4", sample4)
                print("Sum all", sample1+sample2+sample3+sample4)

            end = time.time()
            if verbose:
                print("The conditioned " + str(method) + " algorithm took ", end-start, "CPUs")
            return sample1, sample2, sample3, sample4
        
        del C_2
        del Sumsamp2
        del A
        del B
        del C
        del D
        del c_ind



    

[docs]
def handle_dim_greater_than_3(bounds,
                              method,
                              n_samp,
                              max_iter,
                              max_iter_dim3,
                              max_rej,
                              sum_vec,
                              sample1,
                              sample2,
                              rs,
                              verbose=False):
    """
    handles sampling for dimension>3
        
    Parameters
    ----------
    bounds: list of upper and lower bounds
    method: string of method: LHS or LHSMDU
    n_samp: number of samples to be generated
    max_iter: maximum number of total iterations for dimension 1,2,3 to be feasible, if does not find minimum number of samples after this, breaks
    max_iter_dim3: maximum number of iterations for dimension 1,2,3 to be feasible (for dimension=4)
    max_rej: integer number of maximum allowed samples to be rejected
    sum_vec: vector of sum of compatible sample1 and sample2
    sample1: vector of feasible sample1 
    sample2: vector of feasible sample1
    rs: random seed, default 1234
    verbose: show prints and more info or not, default=False
    
    Returns
    -------
    infeasible1: boolean flag
    sample1, sample2, sample3: vectors
    """
    
    x3_lb = bounds[2][0]
    x3_ub = bounds[2][1]
    infeasible1 = True
    l2 = 0
    
    while infeasible1:
        sample3 = generate_lhs_sample(1, np.array([x3_lb, x3_ub]), rs, method, n_samp)
        Sumsamp2 = np.zeros((n_samp,n_samp)) 
        C_2 = np.zeros((n_samp, n_samp))
        c_ind = []
        C=[]
        D=[]
        notC = []
        notD = []
        for i2 in range(len(sum_vec)):
            for k2 in range(len(sample3)):
                Sumsamp2[i2,k2] = sum_vec[i2]+sample3[k2]
                if Sumsamp2[i2,k2]<=1. and Sumsamp2[i2,k2]>0:
                    C_2[i2,k2] = 1
                    c_ind.append(tuple((i2, k2)))
                    if i2 not in C and k2 not in D:
                        C.append(i2)
                        D.append(k2)
        if len(C) == n_samp and len(D) == n_samp:
            sample1 = sample1[C]
            sample2 = sample2[C]
            sample3 = sample3[D]
            infeasible1 = False

        if len(C)!=n_samp and len(D)!=n_samp:
            if len(C)>=n_samp-max_rej and len(C)<n_samp:
                if verbose:
                    print("rejected", n_samp-len(C), " samples")
                sample1 = sample1[C]
                sample2 = sample2[C]
                sample3 = sample3[D]
                infeasible1 = False
                break
            if l2>max_iter_dim3:
                if len(C)>=n_samp-max_rej and len(D)>=n_samp-max_rej: #2*n_samp
                    sample1 = sample1[C]
                    sample2 = sample2[C]
                    sample3 = sample3[D]
                    infeasible1 = False
                else:
                    infeasible=True
                    print("No feasible sample found after " + str(l2) + " iteration. Please reinitiate sampling, increase max_rej, max_iter or check your bounds to find a feasible sample.")                    
                    break
            for k in range(n_samp):
                if k not in C:
                    notC.append(k)
                if k not in D:
                    notD.append(k)
            #check for feasible tuples
            count=0
            #check for missing C indices:
            counter_pair=0
            for m in notC:
                if counter_pair>n_samp:
                    if verbose:
                        print("counter_pair exceeded")
                    break
                if m not in list(zip(*c_ind))[0]:
                    count+=1
                        #print("maybe need new sample or to reject")
                if max_rej>0:
                    if count>max_rej-1:
                        if verbose:
                            print("get new sample")
                        break
                #list of possible tuples with current missing index m:
                el0list = [i for i, (v,*_) in enumerate(c_ind) if v==m]
                if len(el0list)>0:
                    #choose random tuple from that list:
                    chosentuple = np.random.choice(range(len(el0list)))

                    # if second index from random tuple not in D:
                    if c_ind[chosentuple][1] in D:
                        ind_rand2 = [i for i, x in enumerate(D) if x == c_ind[chosentuple][1]]
                        if len(ind_rand2)>0:
                            C.remove(C[ind_rand2[0]])
                            D.remove(c_ind[chosentuple][1])
                            notC.append(c_ind[chosentuple][0])
                            notD.append(c_ind[chosentuple][1])
                            C.append(m)
                            D.append(c_ind[chosentuple][1])
                            notC.remove(m)
                            notD.remove(c_ind[chosentuple][1])
                            counter_pair+=1
                    else:
                        C.append(m)
                        D.append(c_ind[chosentuple][1])
                        notC.remove(m)
                        notD.remove(c_ind[chosentuple][1])

                    if len(C) == n_samp and len(D) == n_samp:
                        sample1 = sample1[C]
                        sample2 = sample2[C]
                        sample3 = sample3[D]
                        infeasible1 = False
            
                    
        l2 += 1
        
    return infeasible1, sample1, sample2, sample3

                            
                        

[docs]
def one_constrained_sampling(n_samp,
                             method="LHS",
                             bounds=None,
                             max_iter=60,
                             max_iter_dim2=60,
                             max_iter_dim3=60,
                             max_rej=None,
                             rs=None,
                             verbose=False):
    """
    one_constrained_sampling for up to 4 dimensions, where samples should add up to 1
        
    Parameters
    ----------
    n_samp: number of samples to be generated
    method: string of method: LHS or LHSMDU
    bounds: list of upper and lower bounds
    max_iter: maximum number of total iterations for dimension 1,2,3 to be feasible, if does not find minimum number of samples after this, breaks
    max_iter_dim2: maximum number of iterations for dimension 1,2 to be feasible
    max_iter_dim3: maximum number of iterations for dimension 1,2,3 to be feasible (for dimension=4)
    max_rej: integer number of maximum allowed samples to be rejected
    rs: random seed, default 1234
    verbose: show prints and more info or not, default=False
    
    Returns
    -------
    feasible samples from routines for appropriate number of dimensions
    """
    if max_rej is None:
        max_rej = n_samp//4
    if bounds is None or len(bounds) < 2:
        raise ValueError("Invalid bounds. Please provide valid bounds.")

    dim = len(bounds)

    if dim == 1:
        return handle_dim_1(bounds, method, n_samp, rs, verbose)

    elif dim == 2:
        return handle_dim_2(bounds, method, n_samp, rs, verbose)

    elif dim > 2:
        return handle_dim_greater_than_2(bounds, method, n_samp, max_iter, max_iter_dim2, max_iter_dim3, max_rej, rs, verbose)


    
#-------------------------------------------------------------------------------------------------#
### Collect samples for all permutations of bounds (all selected or subset of samples selected with checking for distance)


[docs]
def get_bounds_for_dimension(combi,
                             prev_bounds):
    """
    extract bounds for specific dimension
        
    Parameters
    ----------
    combi: combination number of bounds, integer
    prev_bounds: list of upper and lower bounds
    
    Returns
    -------
    bounds for this combination number
    """
    return [prev_bounds[ind] for ind in combi]



# def stack_samples(samples,
#                   dim):
#     """
#     stacks samples
        
#     Parameters
#     ----------
#     samples: vectors of samples
#     dim: integer of dimension
    
#     Returns
#     -------
#     stacked samples
#     """
#     return np.column_stack(samples)[:dim]


def select_samples(all_samples,
                   val_samples_ord,
                   tol_norm,
                   num_select,
                   all_select):
    """
    select samples based on distance
        
    Parameters
    ----------
    all_samples: vectors of samples
    val_samples_ord: ordered samples in order of first bound permutation
    tol_norm: tolerance for minimum distance
    num_select: integer, number of samples to be selected if boolean=False
    all_select: boolean, if True all samples selected, if False num_select samples selected
    
    Returns
    -------
    selected samples
    """
    if all_select:
        return all_samples + val_samples_ord
    else:
        dist_matrix = distance_matrix(all_samples, val_samples_ord)
        selected_samples = []
        for j in range(len(val_samples_ord)):
            if np.all(dist_matrix[:, j]) > tol_norm:
                selected_samples.append(val_samples_ord[j])
                
        return all_samples + selected_samples[-num_select:]
    
    

[docs]
def one_constrained_sampling_wrapper(methodname,
                                     dim,
                                     n_samp,
                                     bounds,
                                     max_rej,
                                     rs):
    """
    select samples based on distance
        
    Parameters
    ----------
    methodname: string of method, LHS or LHSMDU
    dim: dimension of input space
    n_samp: number of samples to be collected
    bounds: list of lower and upper bounds
    max_rej: maximum samples to be rejected
    rs: random seed, default 1234
    
    Returns
    -------
    one constrained feasible samples from routines for dim 2, 3, 4
    """
    if dim == 2:
        return one_constrained_sampling(n_samp, method=methodname, bounds=bounds, max_rej=max_rej, rs=rs)
    elif dim == 3:
        return one_constrained_sampling(n_samp, method=methodname, bounds=bounds, max_rej=max_rej, rs=rs)
    elif dim == 4:
        return one_constrained_sampling(n_samp, method=methodname, bounds=bounds, max_rej=max_rej, rs=rs)
    elif dim > 4:
        print("This algorithm works for maximum 4 dimensions.")
        return None


    

[docs]
def sample_with_bound_permutations(
        prev_bounds, n_samp, tol_norm=1e-3, all_select=False, 
        num_select=4, max_rej=None, dim=None, rs=1234, verbose=False):
    """
    Computes samples with bound permutations.

    Parameters
    ----------
    prev_bounds : list
        List of lower and upper bounds for sampling.
    n_samp : int
        Number of samples to generate.
    tol_norm : float, optional
        Tolerance for minimum distance between samples.
    all_select : bool, optional
        Whether to select all samples or a fixed number.
    num_select : int, optional
        Number of samples to select if `all_select` is False.
    max_rej : int, optional
        Maximum number of rejections allowed during sampling.
    dim : int
        Dimensionality of the problem.
    rs : int, optional
        Random seed for reproducibility.
    verbose : bool, optional
        Whether to print verbose output.

    Returns
    -------
    tuple
        All feasible samples generated using two methods.
    """
    def get_permuted_bounds(combi, prev_bounds):
        return [prev_bounds[idx] for idx in combi]

    def stack_samples(samples, combi):
        # Reorder samples based on permutation combination
        stacked = np.zeros_like(samples)
        for num, ind in enumerate(combi):
            stacked[:, ind] = samples[:, num]
        return stacked

    def select_samps(existing_samples, new_samples, tol_norm, num_select):
        dist_matrix = distance_matrix(existing_samples, new_samples)
        selected_indices = [
            j for j in range(len(new_samples))
            if np.min(dist_matrix[:, j]) > tol_norm
        ]
        if not all_select:
            selected_indices = sorted(
                selected_indices, 
                key=lambda idx: np.min(dist_matrix[:, idx]), 
                reverse=True
            )[:num_select]
        return new_samples[selected_indices, :]

    # Initialize
    start = time.time()
    all_perms = list(permutations(range(dim)))
    if max_rej is None:
        max_rej = n_samp // 4
    all_val_samples, all_val_samples_mdu = None, None

    # Iterate over methods
    for num_meth in range(2):
        methodname = "LHS" if num_meth == 0 else "LHSMDU"
        for perm_ind, combi in enumerate(all_perms):
            bounds = get_permuted_bounds(combi, prev_bounds)
            if verbose:
                print(f"Permutation {perm_ind}: {combi}, Bounds: {bounds}")

            samples = one_constrained_sampling(
                method=methodname, n_samp=n_samp, bounds=bounds, max_rej=max_rej
            )

            # Convert samples to stacked format
            samples_stacked = np.column_stack(samples)
            if perm_ind == 0:
                if num_meth == 0:
                    all_val_samples = samples_stacked
                else:
                    all_val_samples_mdu = samples_stacked
            else:
                val_samples_ord = stack_samples(samples_stacked, combi)
                if num_meth == 0:
                    if all_select:
                        all_val_samples = np.vstack((all_val_samples, val_samples_ord))
                    else:
                        all_val_samples = np.vstack((
                            all_val_samples, 
                            select_samps(all_val_samples, val_samples_ord, tol_norm, num_select)
                        ))
                else:
                    if all_select:
                        all_val_samples_mdu = np.vstack((all_val_samples_mdu, val_samples_ord))
                    else:
                        all_val_samples_mdu = np.vstack((
                            all_val_samples_mdu, 
                            select_samps(all_val_samples_mdu, val_samples_ord, tol_norm, num_select)
                        ))

        if verbose:
            elapsed = time.time() - start
            print(f"{methodname} method completed in {elapsed:.2f} seconds.")

    return all_val_samples, all_val_samples_mdu

   
    
###Probabilistic version that runs sample_with_bound_permutations for multiple random seeds:

[docs]
def prob_sample_with_bound_permutations(
    seeds=[42, 123, 7, 99, 56],
    prev_bounds=None, 
    n_samp=10, 
    tol_norm=1e-3, 
    all_select=False, 
    num_select=4, 
    max_rej=None, 
    dim=None, 
    verbose=False):
    """
    Perform probabilistic sampling with bound permutations for a list of seeds.
    
    Args:
        seeds (list): List of random seeds. by default list of 5 random seeds.
        prev_bounds (list): Previous bounds for sampling.
        n_samp (int): Number of samples per seed.
        tol_norm (float): Tolerance for the norm.
        all_select (bool): Whether to select all points.
        num_select (int): Number of selections to make.
        max_rej (int): Maximum rejections allowed.
        dim (int): Dimensionality of the sampling space.
        rs: random seed, default 1234
        verbose: show prints and more info or not, default=False
        
    Returns:
        tuple: Contains the selected LHS samples, selected MDU samples, and their means and standard deviations.
    """
    all_val_samples = []
    all_val_samples_mdu = []
    # Generate the samples for each seed
    for rs in seeds:
        all_val_samplesi, all_val_samples_mdui = sample_with_bound_permutations(
            prev_bounds=prev_bounds, n_samp=n_samp, tol_norm=tol_norm,
            all_select=all_select, num_select=num_select, max_rej=max_rej,
            dim=dim, rs=rs, verbose=False)
        all_val_samples.append(np.array(all_val_samplesi))  # Convert to NumPy array
        all_val_samples_mdu.append(np.array(all_val_samples_mdui))
    # Calculate the minimum length across all seeds
    min_len = min(len(samples) for samples in all_val_samples)
    min_len_mdu = min(len(samples) for samples in all_val_samples_mdu)

    # Initialize lists to store the sampled data
    all_val_samples_selected = []
    all_val_samples_mdu_selected = []

    # Randomly select `min_len` samples for each seed
    for k in range(len(seeds)):   
        # Randomly sample rows
        sampled_indices_lhs = np.random.choice(len(all_val_samples[k]), size=min_len, replace=False)
        sampled_indices_mdu = np.random.choice(len(all_val_samples_mdu[k]), size=min_len_mdu, replace=False)

        # Select the sampled rows
        sampled_lhs = all_val_samples[k][sampled_indices_lhs, :]
        sampled_mdu = all_val_samples_mdu[k][sampled_indices_mdu, :]

        # Append the sampled data to the lists
        all_val_samples_selected.append(sampled_lhs)
        all_val_samples_mdu_selected.append(sampled_mdu)

    # Stack the selected samples along a new axis (shape: num_seeds x min_len x n_dim)
    all_val_samples_selected = np.stack(all_val_samples_selected, axis=0)
    all_val_samples_mdu_selected = np.stack(all_val_samples_mdu_selected, axis=0)

    # Compute the mean and standard deviation across the seeds (axis=0)
    mean_samples = np.mean(all_val_samples_selected, axis=0)
    std_samples = np.std(all_val_samples_selected, axis=0)
    mean_samples_mdu = np.mean(all_val_samples_mdu_selected, axis=0)
    std_samples_mdu = np.std(all_val_samples_mdu_selected, axis=0)

    return (all_val_samples_selected, all_val_samples_mdu_selected, 
            mean_samples, std_samples, mean_samples_mdu, std_samples_mdu)



### Select subset of samples that varies the most in terms of distance from the already collected data

[docs]
def scale_data(data,
               decimals=3):
    """
    scales data via standard scaling
        
    Parameters
    ----------
    data: numpy array or pandas dataframe of data
    decimals: decimals to be rounded to, integer
    
    Returns
    -------
    scaled data
    """
    scaler = StandardScaler().fit(data)
    return scaler.transform(np.around(data, decimals=decimals)), scaler




[docs]
def select_samples(data_scaled,
                   samples,
                   samples_unscaled,
                   tol,
                   tol2,
                   decimals=3,
                   des_n_samp=None):
    """
    selects samples based on distance from data set
        
    Parameters
    ----------
    data_scaled: scaled data, numpy array
    samples: array of samples
    samples_unscaled: array of unscaled samples
    tol: tolerance for minimum distance to data, float
    tol2: tolerance for minimum distance to other already selected samples, float
    decimals: decimals to round to, integer
    des_n_samp: desired number of samples/experiments to be executed, default None
    
    Returns
    -------
    selected scaled rounded samples: array
    selected unscaled rounded samples: array
    selected_ind_list: list of selected indices
    """
    tollist = []
    norm_list = []

    for j in range(len(samples)):
        # Calculate the Euclidean distance between samples[j,:] and data_scaled
        distances = np.linalg.norm(data_scaled - samples[j, :], axis=1)
        if np.all(distances > tol):
            if len(tollist) == 0:
                tollist.append(j)
                norm_list.append(min([np.linalg.norm(data_scaled[i,:]-samples[j,:]) for i in range(len(data_scaled))]))
            else:
                # Calculate distances between samples[j,:] and samples in tollist
                distances_to_tollist = np.linalg.norm(samples[j, :] - samples[tollist, :], axis=1)
                if np.all(distances_to_tollist > tol2) and des_n_samp is not None and j not in tollist:
                    tollist.append(j)
                    norm_list.append(min([np.linalg.norm(data_scaled[i,:]-samples[j,:]) for i in range(len(data_scaled))]))

    sorted_norm_list = sorted(norm_list)
    selected_list = sorted_norm_list[-des_n_samp:]

    selected_norm_ind_list = [norm_list.index(i) for i in selected_list]
    selected_ind_list = [tollist[i] for i in selected_norm_ind_list]
    
    return np.around(samples[selected_ind_list, :],decimals=decimals), np.around(samples_unscaled[selected_ind_list,:], decimals=decimals), selected_ind_list




[docs]
def select_samples_diff_from_data(exp_data,
                                  samples_LHS,
                                  samples_LHSMDU,
                                  des_n_samp=15,
                                  tol=5e-1,
                                  tol2=5e-1,
                                  decimals=3):
    """
    select samples based on distance from experimental data
        
    Parameters
    ----------
    exp_data: array of experimental data
    samples_LHS: LHS samples array
    samples_LHSMDU: LHSMDU samples array
    des_n_samp: desired number of samples/experiments to be executed
    tol: tolerance for minimum distance to data, float
    tol2: tolerance for minimum distance to other already selected samples, float
    decimals: decimals to round to, integer
    
    Returns
    -------
    tol_samples: selected samples with LHS
    tol_samples_LHSMDU: selected samples with LHSMDU
    tol_samples_unscaled: selected unscaled samples with LHS
    tol_samples_LHSMDU_unscaled: selected unscaled samples with LHSMDU
    
    """
    data_scaled, scaler = scale_data(exp_data)
    samples_LHS_scaled, scaler2 = scale_data(np.around(samples_LHS, decimals=decimals))
    samples_LHSMDU_scaled, scaler3 = scale_data(np.around(samples_LHSMDU, decimals=decimals))

    tol_samples, tol_samples_unscaled, selected_ind_list1 = select_samples(data_scaled, samples_LHS_scaled, samples_LHS, tol, tol2, decimals, des_n_samp)
    tol_samples_LHSMDU, tol_samples_LHSMDU_unscaled, selected_ind_list2 = select_samples(data_scaled, samples_LHSMDU_scaled, samples_LHSMDU, tol, tol2, decimals, des_n_samp)

    return tol_samples, tol_samples_LHSMDU, tol_samples_unscaled, tol_samples_LHSMDU_unscaled