13.3. GTSDA¶

Examples

example_gtsda_checker_simple.py
example_gtsda_checker_plots.py
example_gtsda_checker_partial.py
example_gtsda_ranker_simple.py
example_gtsda_ranker_screening.py
example_gtsda_ranker_sobol.py
example_gtsda_ranker_taguchi.py
example_gtsda_selector_simple.py

13.3.1. example_gtsda_checker_simple.py ¶

#
# coding: utf-8
# Copyright (C) pSeven SAS, 2010-present
#
from da.p7core import gtsda

import numpy as np

def run_example():
  """Correlation analysis of a noisy linear dependency with GTSDA Checker.

  Builds a random sample for y = -0.3 * x1 + x2 + noise and prints scores,
  p-values and decisions obtained with default options, with Pearson
  correlation ("Asymptotic" and "Permutations" p-value estimation) and with
  Pearson partial correlation.
  """
  def report(title, underline, scores, p_values, decisions):
    # every result section is printed with the same layout
    print(title)
    print(underline)
    print('scores:    %s' % scores)
    print('p_values:  %s' % p_values)
    print('decisions: %s' % decisions)
    print('\n')

  # prepare data: the inputs are drawn first, the additive noise second
  number_points = 50
  number_dimensions = 2
  x = np.random.rand(number_points, number_dimensions)
  noise = np.random.rand(number_points)
  y = -0.3 * x[:, 0] + x[:, 1] + 0.05 * noise

  print('Original dependency is: y = -0.3 * x1 + x2 + 0.05 * random()')
  print('The number of points is %d' % number_points)
  print('')

  # create GTSDA Analyzer object
  analyzer = gtsda.Analyzer()

  # checking procedure with default options
  outcome = analyzer.check(x=x, y=y)
  report('Results of correlation analysis with default options:',
         '=====================================================',
         outcome.scores, outcome.p_values, outcome.decisions)

  # checking procedure with Pearson correlation, once per p-value estimation method
  pearson_runs = (
    ('Asymptotic',
     'Results of correlation analysis with Pearson correlation\ncoefficient and "Asymptotic" estimation of the p-value:'),
    ('Permutations',
     'Results of correlation analysis with Pearson correlation\ncoefficient and "Permutations" estimation of the p-value:'),
  )
  for p_value_method, title in pearson_runs:
    options = {'GTSDA/Checker/Technique': 'PearsonCorrelation', 'GTSDA/Checker/PValues/Method': p_value_method}
    outcome = analyzer.check(x=x, y=y, options=options)
    report(title,
           '========================================================',
           outcome.scores, outcome.p_values, outcome.decisions)

  # checking procedure with partial correlation coefficient;
  # partial correlations require an explicit explanatory variable
  options = {'GTSDA/Checker/Technique': 'PearsonPartialCorrelation'}

  # one row per output (a single output here), one column per input
  table_shape = (1, number_dimensions)
  partial_scores = np.empty(table_shape)
  partial_p_values = np.empty(table_shape)
  partial_decisions = np.empty(table_shape)
  for column in range(number_dimensions):
    # every input except the current one acts as the explanatory variables
    explanatory = np.delete(x, column, axis=1)
    outcome = analyzer.check(x=x[:, column], y=y, z=explanatory, options=options)
    partial_scores[:, column] = outcome.scores
    partial_p_values[:, column] = outcome.p_values
    partial_decisions[:, column] = outcome.decisions

  report('Results of correlation analysis with partial correlation coefficient:',
         '=====================================================================',
         partial_scores, partial_p_values, partial_decisions)

def main():
  """
  Run the GTSDA Checker example framed by two banner lines.
  """
  banner = '=' * 80
  print(banner)
  run_example()
  print(banner)

if __name__ == "__main__":
  main()

13.3.2. example_gtsda_checker_plots.py ¶

#
# coding: utf-8
# Copyright (C) pSeven SAS, 2010-present
#

"""
This example illustrates how to use the dependence checking functionality on various types of dependencies.
You can change the technique and compare the results.
"""
import os
import numpy as np
from matplotlib import pyplot as plt
from da.p7core import gtsda

def benchmark_generate_samples(n, key):
  """Generate one of the 28 two-dimensional benchmark samples.

  Args:
    n: number of points to generate.
    key: integer from 0 to 27 selecting the kind of dependency between x and y
      (linear with or without noise, cosine, rotated squares, parabolas, rings,
      four clusters).

  Returns:
    Tuple (x, y) of NumPy arrays, each reshaped to n rows and 1 column.

  Raises:
    ValueError: if key does not select any of the benchmark samples.
  """
  # base abscissa, uniform on [-0.5, 0.5); some cases replace it below
  x = np.random.random(size=n)-0.5
  if key == 0:
    y = x
  elif key == 1:
    y = 0.7 * x + 0.3 * (np.random.random(n) - 0.5)
  elif key == 2:
    y = 0.3 * x + 0.7 * (np.random.random(n) - 0.5)
  elif key == 3:
    y = 0. * x + 0.9 * (np.random.random(n) - 0.5)
  elif key == 4:
    y = -0.3 * x + 0.7 * (np.random.random(n) - 0.5)
  elif key == 5:
    y = -0.8 * x + 0.2 * (np.random.random(n) - 0.5)
  elif key == 6:
    y = -x
  elif key == 7:
    y = x
  elif key == 8:
    y = 0.5 * x
  elif key == 9:
    y = 0.2 * x
  elif key == 10:
    y = np.zeros(n)
  elif key == 11:
    y = -0.2 * x
  elif key == 12:
    y = -0.5 * x
  elif key == 13:
    y = -x
  elif key == 14:
    y = 0.7 * (0.7 * np.cos(x * 3 * np.pi) + 1. * (np.random.random(n) - 0.5))
  elif key == 15:
    # independent uniform pair combined through the angle alpha
    x_ = x
    y_ = np.random.random(n) - 0.5
    alpha = np.arctan(0.3)
    x = x_ * np.cos(alpha) + y_ * np.sin(alpha)
    y = x_ * np.sin(alpha) - y_ * np.cos(alpha)
  elif key == 16:
    x_ = x
    y_ = np.random.random(n) - 0.5
    alpha = np.pi / 4.
    x = x_ * np.cos(alpha) + y_ * np.sin(alpha)
    y = x_ * np.sin(alpha) - y_ * np.cos(alpha)
  elif key == 17:
    y = 3 * (0.9 * x**2 + 0.3 * (np.random.random(n))) - 0.5
  elif key == 18:
    y = 3 * (0.9 * x**2 + 0.15 * (np.random.random(n)))
    # flip the sign of each point with probability 1/2
    for s in range(n):
      if np.random.randint(2) == 0:
        y[s] *= -1
  elif key == 19:
    # noisy ring: x is turned into a polar angle
    phi = x * 2 * np.pi
    r = 0.8 * (1. + 0.3 * (np.random.random(n) - 0.5))
    x = r * np.cos(phi)
    y = r * np.sin(phi)
  elif key == 20:
    # four discs: each point is shifted by +-0.5 along both axes at random
    phi = x * 2. * np.pi
    r = np.random.random(n)*0.3
    x = r * np.cos(phi)
    y = r * np.sin(phi)
    for s in range(n):
      a = np.random.randint(4)
      if a == 0 or a == 1:
        y[s] = y[s] + 0.5
      else:
        y[s] = y[s] - 0.5
      if a == 0 or a == 2:
        x[s] = x[s] - 0.5
      else:
        x[s] = x[s] + 0.5
  elif key == 21:
    y = 0.7 * (0.9 * np.cos(x * 3. * np.pi) + 0.1 * (np.random.random(n) - 0.5))
  elif key == 22:
    x_ = x
    y_ = np.random.random(n) - 0.5
    alpha = np.arctan(0.3)
    x = x_ * np.cos(alpha) + y_ * np.sin(alpha)
    y = (x_ * np.sin(alpha) - y_ * np.cos(alpha))/2.
  elif key == 23:
    x_ = x
    y_ = np.random.random(n) - 0.5
    alpha = np.pi / 4.
    x = x_ * np.cos(alpha) + y_ * np.sin(alpha)
    y = (x_ * np.sin(alpha) - y_ * np.cos(alpha)) / 4.
  elif key == 24:
    y = 3 * (0.95 * x**2 + 0.05 * (np.random.random(n))) - 0.5
  elif key == 25:
    y = 3 * (0.95 * x**2 + 0.01 * (np.random.random(n)))
    for s in range(n):
      if np.random.randint(2) == 0:
        y[s] *= -1
  elif key == 26:
    phi = x * 2 * np.pi
    r = 0.8 * (1. + 0.1 * (np.random.random(n) - 0.5))
    x = r * np.cos(phi)
    y = r * np.sin(phi)
  elif key == 27:
    phi = x * 2. * np.pi
    r = np.random.random(n)*0.2
    x = r * np.cos(phi)
    y = r * np.sin(phi)
    for s in range(n):
      a = np.random.randint(4)
      if a == 0 or a == 1:
        y[s] = y[s] + 0.5
      else:
        y[s] = y[s] - 0.5
      if a == 0 or a == 2:
        x[s] = x[s] - 0.5
      else:
        x[s] = x[s] + 0.5
  else:
    # without this branch an unknown key would fail later with an unbound 'y'
    raise ValueError("Unknown benchmark sample key %r, expected an integer from 0 to 27" % (key,))

  return x.reshape(-1, 1), y.reshape(-1, 1)

def calc_correlation(x, y, corrtype):
  """Return the dependence score between x and y for the technique corrtype.

  A fresh analyzer is created for every call; p-value computation is switched
  off because only the score itself is used.
  """
  checker = gtsda.Analyzer()
  checker.options.set({
    'gtsda/checker/technique': corrtype,
    'gtsda/checker/pvalues/enable': False,
  })

  score_table = checker.check(x, y).scores
  # take the first entry of the first row of the score table
  return score_table[0][0]

def make_big_plot(corrtype='distancecorrelation', save=False):
  """Plot all 28 benchmark samples on a 4x7 grid with their dependence scores.

  For every sample the score computed by calc_correlation is printed to stdout
  and written in red into the corresponding subplot.

  Args:
    corrtype: name of the GTSDA checker technique used to compute the scores.
    save: if true, the figure is saved to 'example_gtsda_checker_<corrtype>.png'
      in the current working directory.

  The figure is shown unless the SUPPRESS_SHOW_PLOTS environment variable is set.
  """
  fig = plt.figure(figsize=(21, 12))
  fig.subplots_adjust(left=0.05, right=0.95, top=0.95, bottom=0.05, wspace=0.4, hspace=0.4)
  for k in range(28):
    x, y = benchmark_generate_samples(1000, k)
    plt.subplot(4, 7, 1 + k)
    plt.axis([-1, 1, -1, 1])
    plt.scatter(x, y, marker='.', color='b')

    printedvalue = calc_correlation(x, y, corrtype)
    print("Sample #%d, score: %s" % (k, printedvalue))

    plt.text(-0.8, 0.8, str(round(printedvalue, 15)), color='r')
  # plain truthiness test instead of comparing the flag with True
  if save:
    plt.savefig('example_gtsda_checker_' + corrtype + '.png')
  if 'SUPPRESS_SHOW_PLOTS' not in os.environ:
    plt.show()


# Script entry point: draw the benchmark grid with distance correlation scores
# and request saving of the figure.
if __name__ == "__main__":
  make_big_plot(corrtype="distancecorrelation", save=True)

13.3.3. example_gtsda_checker_partial.py ¶

#
# coding: utf-8
# Copyright (C) pSeven SAS, 2010-present
#

"""
This example illustrates the fundamental usage of correlation analysis in treating high-dimensional
problems. Commonly used Pearson correlation fails to correctly detect dependency in this case.
This difficulty is solved by using partial Pearson correlation. The difference is due to the fact
that the method excludes the possible influence of other inputs when calculating correlation between
the considered input and output.
"""
#[0]

from da.p7core import gtsda
import os
import numpy as np

#[m1]
def run():
  """Compare Pearson and partial Pearson correlation on the TAXI2000 data set.

  Loads 'TAXI2000.csv' located next to this script, treats the last column as
  the output and all other columns as inputs, computes both correlation
  scores for every input, keeps the absolute values of the scores whose
  decision flag is set, prints a summary and passes the sorted scores to
  plot().
  """
  dirpath = os.path.dirname(__file__)
  filepath = os.path.join(dirpath, 'TAXI2000.csv')
  print("Load data from %s" % filepath)

  data = np.loadtxt(filepath, delimiter=",")

  # all columns but the last are inputs; the last column is kept 2D as the output
  x = data[:, :-1]
  y = data[:, -1:]

#[m2]

  print('Create analyzer object.')
  print('The Pearson partial correlation will be used in current case.')
  analyzer = gtsda.Analyzer()

  print('Compute score values...')

  scores_partial = []
  scores_pearson = []
  for input_index in range(x.shape[1]):
    z = np.hstack((x[:,:input_index], x[:, (input_index + 1):])) # let other x columns be the explanatory variables matrix

    # Calculate scores with partial Pearson correlation coefficient
    result = analyzer.check(x=x[:, input_index], y=y, z=z, options={'GTSDA/Checker/Technique': 'PearsonPartialCorrelation'})
    # use only statistically significant correlations
    if result.decisions[0, 0]:
      scores_partial.append(np.fabs(result.scores[0, 0]))
    print(' feature #%-3d: partial Pearson score=%-15.5g p-value=%-15.5g decision: %d' % (1 + input_index, result.scores[0, 0], result.p_values[0, 0], result.decisions[0, 0]))

    # Calculate scores with Pearson correlation coefficient
    # NOTE(review): z is passed here as well; presumably it is not used by the
    # plain Pearson technique - confirm against the GTSDA documentation
    result = analyzer.check(x=x[:, input_index], y=y, z=z, options={'GTSDA/Checker/Technique': 'PearsonCorrelation'})
    # use only statistically significant correlations
    if result.decisions[0, 0]:
      scores_pearson.append(np.fabs(result.scores[0, 0]))
    print(' feature #%-3d:         Pearson score=%-15.5g p-value=%-15.5g decision: %d' % (1 + input_index, result.scores[0, 0], result.p_values[0, 0], result.decisions[0, 0]))
    print('')
#[m3]

  # Convert correlation coefficients to sorted list of scores
  # (descending order, so the strongest correlations come first)
  scores_partial = sorted(scores_partial, reverse=True)
  scores_pearson = sorted(scores_pearson, reverse=True)

  print('\nTotal features number: %d' % x.shape[1])
  print('The number of statistically significant scores based on the partial Pearson correlation coefficients: %d' % len(scores_partial))
  print('The number of statistically significant scores based on the Pearson correlation coefficients: %d\n' % len(scores_pearson))

  print('Statistically significant scores based on the partial Pearson correlation coefficients: %s\n' % scores_partial)
  print('Statistically significant scores based on the Pearson correlation coefficients: %s\n' % scores_pearson)

  plot(scores_partial, scores_pearson)

#[m4]
def plot(scores_partial, scores_pearson):
  """Scatter-plot the significant scores of both techniques and save the figure.

  Args:
    scores_partial: sequence of scores based on Pearson partial correlation.
    scores_pearson: sequence of scores based on Pearson correlation.

  Each sequence is drawn against the positions 1..len(sequence). The figure is
  saved under the name 'gtsda_example_checker_partial' in the current working
  directory and shown unless the SUPPRESS_SHOW_PLOTS environment variable is
  set. If an ImportError is raised (matplotlib is imported lazily here), a
  notice is printed instead.
  """
  try:
    import matplotlib.pyplot as plt

    print('Plotting...')
    # GTSDA scores
    plt.subplot(111)
    plt.scatter(np.arange(1, 1 + len(scores_partial)), scores_partial, s=10, c='r', label='Pearson partial correlation')
    plt.scatter(np.arange(1, 1 + len(scores_pearson)), scores_pearson, s=10, c='b', label='Pearson correlation')
    plt.xlabel('Feature number')
    plt.ylabel('Statistically significant score')
    plt.grid(True)
    plt.legend(loc='best')
    # save and show plots
    name = 'gtsda_example_checker_partial'
    plt.savefig(name)
    print('Plots are saved to %s.png' % os.path.join(os.getcwd(), name))
    print('On the plot we see that relative score values closely resemble the index of variability.')
    print('From this plot one may conclude that there are only 12 important variables in the considered region and the rest 151 may be dropped in the analysis.')
    if 'SUPPRESS_SHOW_PLOTS' not in os.environ:
      plt.show()
  except ImportError:
    print('Plotting is not available due to the matplotlib library absence.')

#[m0]
if __name__ == "__main__":
  # run the GTSDA partial correlation example only when executed as a script
  run()

13.3.4. example_gtsda_ranker_simple.py ¶

#
# coding: utf-8
# Copyright (C) pSeven SAS, 2010-present
#
from da.p7core import blackbox, gtsda
from da.p7core.loggers import StreamLogger
import numpy as np

class ExampleBlackbox(blackbox.Blackbox):
  """
  Three-variable, single-response problem for GTSDA ranker in blackbox mode
  """
  def prepare_blackbox(self):
    # declare three variables, each with the (0, 1) tuple (presumably its bounds)
    for _ in range(3):
      self.add_variable((0, 1))
    # declare the only response
    self.add_response()

  def evaluate(self, queryx):
    # the response of every queried point is the sum of its coordinates
    return [sum(point) for point in queryx]


def blackbox():
  """
  Estimate variable scores for a blackbox, first with default options and
  then with Sobol indices, printing the scores of every blackbox response.
  """
  rule = '-' * 60

  def show(outcome, caption):
    # result summary, caption and one line of scores per blackbox response
    print(str(outcome))
    print(caption)
    print(rule)
    for index, score in enumerate(outcome.scores):
      print('score for blackbox response[%d]: %s' % (index, score))
    print(rule)

  # ranker with a fixed seed; StreamLogger outputs to sys.stdout by default
  ranker = gtsda.Analyzer()
  ranker.options.set("GTSDA/Seed", 100)
  ranker.set_logger(StreamLogger())

  # the problem and the budget passed to both ranking runs
  bbox = ExampleBlackbox()
  budget = 350

  default_result = ranker.rank(blackbox=bbox, budget=budget)
  show(default_result, "\nResults with default options (screening indices are selected):")

  sobol_result = ranker.rank(blackbox=bbox, budget=budget, options={'GTSDA/Ranker/Technique':'sobol'})
  show(sobol_result, "\nResults with Sobol indices:")


def sample():
  """
  Estimate scores of the input variables with respect to each output variable
  based on a "solid" sample given by the user.

  The sample has 4 inputs and 2 outputs: the first output is x1 + 2 * x2, the
  second output is x4; the third input affects neither of them.
  """
  # prepare data
  # note that the tool needs a big training sample
  number = 1000
  input_dimension = 4
  np.random.seed(100)
  # input part of sample
  x = np.random.rand(number, input_dimension)
  # output part of sample: list-style indexing keeps the columns 2D for hstack
  y = np.hstack((x[:, [0]] + 2 * x[:, [1]], x[:, [3]]))

  # create ranker
  ranker = gtsda.Analyzer()
  # set Logger
  ranker.set_logger(StreamLogger())

  # get result
  result = ranker.rank(x=x, y=y)
  # print info about results:
  print(str(result))
  print("\nResults:")
  print('-' * 60)
  for i, s in enumerate(result.scores):
    print('score for output[%d]: %s' % (i, s))
  print('-' * 60)

def main():
  """
  Run both GTSDA Ranker examples, separating them with banner lines.
  """
  banner = '=' * 60
  print(banner)
  # the sample-based example goes first, the blackbox-based one second
  for example in (sample, blackbox):
    example()
    print(banner)

if __name__ == "__main__":
  main()

13.3.5. example_gtsda_ranker_screening.py ¶

#
# coding: utf-8
# Copyright (C) pSeven SAS, 2010-present
#

from da.p7core import gtsda
import numpy as np

def main():
  """
  Compute screening indices with GTSDA Ranker for a synthetic sample and
  print the mu_star, mu and sigma values found in the detailed result info.
  """
  # inputs: 100 points with 4 coordinates, uniform on [-1, 1)
  np.random.seed(100)
  sample_size, n_inputs = 100, 4
  x = np.random.rand(sample_size, n_inputs) * 2 - 1
  # output: linear in the first two inputs, quadratic and cubic in the last two
  y = x[:, 0] + 2 * x[:, 1] + x[:, 2]**2 + x[:, 3]**3

  # analysis; the original example notes that 'screening' is the default
  # index type, so analyzer.rank(x=x, y=y) would be enough
  analyzer = gtsda.Analyzer()
  rank_result = analyzer.rank(x=x, y=y, options={'GTSDA/Ranker/Technique': 'screening'})

  # read all values first, then print them
  details = rank_result.info['Ranker']['Detailed info']
  mu_star, mu, sigma = details['mu_star'], details['mu'], details['sigma']

  print("mu_star: %s" % mu_star)
  print("mu: %s" % mu)
  print("sigma: %s" % sigma)

if __name__ == "__main__":
  main()

13.3.6. example_gtsda_ranker_sobol.py ¶

#
# coding: utf-8
# Copyright (C) pSeven SAS, 2010-present
#

from da.p7core import gtsda
import numpy as np

def main():
  """
  Example of Sobol indices computation with GTSDA Ranker.

  Ranks the 4 inputs of a synthetic sample whose output depends on the first
  three of them (including an x1 * x2 interaction term) and prints the total,
  main and interaction indices read from the detailed result info.
  """
  # prepare data
  number_points = 2000
  input_dimension = 4
  np.random.seed(100)
  # input part of sample, uniform on [-1, 1)
  x = np.random.rand(number_points, input_dimension) * 2 - 1
  # output part of sample
  y = x[:, 0]**2 + 2 * x[:, 0] * x[:, 1] + x[:, 2]**2

  # doing analysis...
  analyzer = gtsda.Analyzer()
  rank_result = analyzer.rank(x=x, y=y, options={'GTSDA/Ranker/Technique': 'sobol'})

  # and reading results...
  total_indices = rank_result.info['Ranker']['Detailed info']['Total indices']
  main_indices = rank_result.info['Ranker']['Detailed info']['Main indices']
  interact_indices = rank_result.info['Ranker']['Detailed info']['Interaction indices']

  print("Total indices: %s" % total_indices)
  print("Main indices: %s" % main_indices)
  # label spelling fixed (was "Intearaction")
  print("Interaction indices: %s" % interact_indices)

if __name__ == "__main__":
  main()

13.3.7. example_gtsda_ranker_taguchi.py ¶

#
# coding: utf-8
# Copyright (C) pSeven SAS, 2010-present
#

from da.p7core import gtsda
import numpy as np

def main():
  """
  Compute Taguchi signal-to-noise indices with GTSDA Ranker and print the scores.
  """

  # design: 9 runs of 4 factors
  design = np.array([[100, 2, 4, 0.1],
                     [100, 5, 6, 0.2],
                     [100, 8, 8, 0.3],
                     [150, 2, 6, 0.3],
                     [150, 5, 8, 0.1],
                     [150, 8, 4, 0.2],
                     [200, 2, 8, 0.2],
                     [200, 5, 4, 0.3],
                     [200, 8, 6, 0.1]])
  # responses: one row per run, one column per repeated measurement
  responses = np.array([[87.3, 82.3, 70.7],
                        [74.8, 70.7, 63.2],
                        [56.5, 54.0, 45.7],
                        [79.8, 78.2, 62.3],
                        [77.3, 76.5, 54.0],
                        [89.0, 87.3, 83.2],
                        [64.8, 62.3, 55.7],
                        [99.0, 93.2, 87.3],
                        [75.7, 74.0, 63.2]])
  # the design is repeated once per response column, and the response columns
  # are stacked one under another, so the rows of x and y stay aligned
  x = np.tile(design, (responses.shape[1], 1))
  y = responses.reshape(-1, 1, order='F')

  # rank with the Taguchi technique
  ranker = gtsda.Analyzer()
  result = ranker.rank(x=x, y=y, options={"GTSDA/Ranker/Technique": "Taguchi",
                                          "GTSDA/Ranker/Taguchi/Method": "signal_to_noise"})

  # print result
  print(str(result.scores))

if __name__ == "__main__":
  main()

13.3.8. example_gtsda_selector_simple.py ¶

#
# coding: utf-8
# Copyright (C) pSeven SAS, 2010-present
#
from da.p7core import gtsda
from da.p7core.loggers import StreamLogger
import numpy as np
import sys

def _make_sample(sample_size, input_dim, func):
  x = np.random.rand(sample_size, input_dim)
  # output part of sample
  y = func(x)
  return x, y

def mystery_function(x):
  """Example function of the first three input columns.

  Args:
    x: 2D NumPy array with one point per row; only columns 0, 1 and 2 are
      read, any further columns are ignored.

  Returns:
    1D NumPy array with one function value per row of x.
  """
  first, second, third = x[:, 0], x[:, 1], x[:, 2]

  parabola = second - 5 * first * first
  offset_first = 1 - 5 * first
  offset_second = 2 - 5 * second
  ripple = np.sin(2.5 * first) * np.sin(17.5 * first * second)

  return (2 + 0.25 * parabola * parabola + offset_first * offset_first
          + 2 * offset_second * offset_second + 7 * ripple + 10 * third**2)

def get_ranking(x, y, analyzer, direction='dec'):
  """Rank the input features by the scores computed with analyzer.rank().

  Args:
    x: input part of the sample, passed to analyzer.rank() as is.
    y: output part of the sample, passed to analyzer.rank() as is.
    analyzer: object providing rank(x=..., y=...) whose result has a 'scores'
      attribute; only the first row of the scores is used (presumably the
      scores for the first output).
    direction: 'dec' to order features by decreasing score (default),
      'inc' to order them by increasing score.

  Returns:
    Tuple (ranks, scores): feature indices in the requested order and the
    scores they were derived from.

  Raises:
    ValueError: if direction is neither 'inc' nor 'dec'.
    Exception: if the ranking procedure fails; the message includes the
      original error.
  """
  # validate the cheap argument before starting the potentially long ranking
  if direction not in ('inc', 'dec'):
    raise ValueError("Direction of the ranking should be 'inc' or 'dec'")

  try:
    scores = analyzer.rank(x=x, y=y).scores[0]
  except Exception:
    # 'except Exception' instead of a bare 'except', so that KeyboardInterrupt
    # and SystemExit are not converted into a ranking error
    raise Exception("Error occurred in the ranking procedure: %s." % sys.exc_info()[1])

  # Get ranks based on scores
  if direction == 'inc':
    ranks = np.argsort(scores)
  else:
    ranks = np.argsort(-scores)

  print('SCORES: %s' % str(scores))
  print('RANKS: %s' % str(ranks))

  return ranks, scores

def run_example():
  """Select features for a random sample of mystery_function with GTSDA Selector.

  The features are ranked first; the ranking is then passed to the selector
  twice, with 'internal' and with 'trainsample' validation, and the selected
  features are printed next to the expected ones.
  """
  # sample: 100 points with 4 inputs, generated with a fixed seed
  np.random.seed(100)
  sample_size, input_dim = 100, 4
  x, y = _make_sample(sample_size, input_dim, mystery_function)

  # mystery_function reads the input columns 0, 1 and 2 only
  optimal_feature_list = [0, 1, 2]

  # analyzer with a logger attached
  analyzer = gtsda.Analyzer()
  analyzer.set_logger(StreamLogger())

  ranking = get_ranking(x, y, analyzer)[0]

  # one selection per validation type: internal validation (IV) first,
  # train sample error computation second
  selections = {}
  for validation_type in ('internal', 'trainsample'):
    options = {'GTSDA/Selector/ValidationType': validation_type}
    selections[validation_type] = analyzer.select(x=x, y=y, ranking=ranking, options=options)

  print("\nOptimal features: %s" % optimal_feature_list)
  print("Selected features with IV: %s" % selections['internal'].feature_list[:, 0])
  print("Selected features with train sample validation: %s" % selections['trainsample'].feature_list[:, 0])

def main():
  """
  Run the GTSDA Selector example framed by two banner lines.
  """
  banner = '=' * 60
  print(banner)
  run_example()
  print(banner)

if __name__ == "__main__":
  main()