Skip to content

Commit

Permalink
Dextran scores have been updated to better choose endpoints and smoothing
Browse files Browse the repository at this point in the history

Added a function to score.py which uses Pearson correlation against a spline instead of against the array data; this prevents conversion of the spline to an array and back to a spline again
  • Loading branch information
Immudzen committed Jan 14, 2020
1 parent e728a76 commit d15e6fb
Show file tree
Hide file tree
Showing 11 changed files with 151 additions and 45 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -319,3 +319,5 @@ __pycache__/
/Examples/MCMC/Dextran/nsga3_mcmc_6_emcee6_fast
/Examples/MCMC/Dextran/dextran_pulse_fast.csv
/Examples/MCMC/Dextran/dextran_pulse_high.h5
/Examples/Example1/Dextran/fit_nsga3_deap_new
/Examples/Example1/Dextran/fit_nsga3_deap_sse
2 changes: 1 addition & 1 deletion CADETMatch.egg-info/PKG-INFO
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: CADETMatch
Version: 0.5.13
Version: 0.5.15
Summary: CADETMatch is a parameter estimation and error modeling library for CADET
Home-page: https://github.com/modsim/CADET-Match
Author: William Heymann
Expand Down
2 changes: 1 addition & 1 deletion CADETMatch/CADETMatch.pyproj
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
<IsWindowsApplication>False</IsWindowsApplication>
<InterpreterId>CondaEnv|CondaEnv|CADETMatch</InterpreterId>
<LaunchProvider>Standard Python launcher</LaunchProvider>
<CommandLineArguments>"C:\Users\kosh_000\Documents\Visual Studio 2017\Projects\CADETMatch\Examples\MCMC\Dextran\MCMC_dextran_nsga3.json" 0</CommandLineArguments>
<CommandLineArguments>"F:\Parameter Estimation\Synthetic\Dextran\NSGA3_dextran_SSE.json" 6</CommandLineArguments>
<EnableNativeCodeDebugging>False</EnableNativeCodeDebugging>
<InterpreterArguments>
</InterpreterArguments>
Expand Down
4 changes: 2 additions & 2 deletions CADETMatch/generate_graphs.py
Original file line number Diff line number Diff line change
Expand Up @@ -472,9 +472,9 @@ def plot_2d_single(directory_path, header_x, scoreName, data, scores):
if numpy.all(scores <= 0):
scores = numpy.abs(scores)

score_scale = numpy.max(scores)/numpy.min(scores)
#score_scale = numpy.max(scores)/numpy.min(scores)

if score_scale > 1e2:
if scoreName.startswith('1-'):
scores = numpy.log(scores)
scoreName = 'log(%s)' % scoreName

Expand Down
3 changes: 2 additions & 1 deletion CADETMatch/gradFD.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,8 +147,9 @@ def filterOverlapArea(cache, checkOffspring, cutoff=0.01):
percent = temp_area_overlap / temp_area_total
if percent > cutoff:
temp_offspring.append( (percent, ind) )
multiprocessing.get_logger().info('kept %s with overlap (%s) in gradient descent', ind, percent)
else:
multiprocessing.get_logger().info('removed %s for insufficient overlap in gradient descent', ind)
multiprocessing.get_logger().info('removed %s for insufficient overlap (%s) in gradient descent', ind, percent)
else:
multiprocessing.get_logger().info('removed %s for failure', ind)

Expand Down
40 changes: 40 additions & 0 deletions CADETMatch/score.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,46 @@ def pearson_spline(exp_time_values, sim_data_values, exp_data_values):

return score, dt

def pearson_spline_fun(exp_time_values, exp_data_values, sim_spline):
    """Score how well a simulation spline matches the experimental data.

    Cross-correlates the two curves on a fine grid to find a coarse time
    offset, refines that offset with a gradient-free search, then computes
    the Pearson correlation of the time-aligned curves.

    Returns a tuple ``(score, dt)`` where ``score`` is the transformed
    Pearson correlation and ``dt`` is the refined time offset.
    """
    # resample on a much finer time step so the cross-correlation peak
    # pins the offset more precisely
    step = 1e-1
    grid = numpy.arange(exp_time_values[0], exp_time_values[-1], step)

    sim_on_grid = sim_spline(grid)
    exp_spline = scipy.interpolate.InterpolatedUnivariateSpline(exp_time_values, exp_data_values, ext=3)
    exp_on_grid = exp_spline(grid)

    # peak of the full cross-correlation gives the coarse lag between curves
    xcorr = scipy.signal.correlate(exp_on_grid, sim_on_grid)
    lag_index = numpy.argmax(xcorr)
    coarse_dt = (lag_index - len(grid) + 1) * step

    # refine the time offset with a deterministic, gradient-free method (Powell's)
    dt = refine_time_fun(coarse_dt, exp_time_values, exp_data_values, sim_spline)

    # Pearson correlation of the experiment against the shifted simulation
    shifted_sim = sim_spline(exp_time_values - dt)
    pear = scipy.stats.pearsonr(exp_data_values, shifted_sim)[0]
    score = pear_corr(pear)

    return score, dt

def refine_time_fun(dt, exp_time_values, exp_data_values, spline):
    """Refine the time offset between experiment and simulation spline.

    Minimizes the sum of squared errors between the experimental data and the
    simulation spline shifted by the offset, starting from the coarse offset
    ``dt``. Powell's method (gradient free) is used because the system is not
    smooth enough for gradient-based methods to work well.

    Returns the refined offset as a plain ``float``.
    """
    exp_time_values = numpy.array(exp_time_values)
    exp_data_values = numpy.array(exp_data_values)

    def goal_sse(offset):
        # SSE between the experiment and the simulation shifted by offset
        sim_data_values_copy = spline(exp_time_values - offset)

        sse = numpy.sum((exp_data_values - sim_data_values_copy)**2)

        return sse

    result = scipy.optimize.minimize(goal_sse, dt, method='powell')

    # result.x is a 0-d ndarray for a scalar Powell problem; convert to float
    # for a clean scalar return, consistent with float(result.x) used elsewhere
    return float(result.x)

def time_function_decay(CV_time, peak_time, diff_input=False):
x_exp = numpy.array([0, 1.0*CV_time])
y_exp = numpy.array([1, 0.5])
Expand Down
43 changes: 34 additions & 9 deletions CADETMatch/scores/dextranSSE.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,20 +25,16 @@ def run(sim_data, feature):

sim_time_values, sim_data_values = util.get_times_values(sim_data['simulation'], feature)

diff = feature['value'] - sim_data_values

if max(sim_data_values) < max_value: #the system has no point higher than the value we are looking for
#remove hard failure
max_value = max(sim_data_values)

exp_time_values = exp_time_values[selected]
exp_data_zero = feature['exp_data_zero']

min_index = numpy.argmax(sim_data_values >= 1e-3*max_value)
max_index = numpy.argmax(sim_data_values >= max_value)

sim_data_zero = numpy.zeros(len(sim_data_values))
sim_data_zero[min_index:max_index+1] = sim_data_values[min_index:max_index+1]
sim_data_values_smooth = smoothing.smooth_data(sim_time_values, sim_data_values,
feature['critical_frequency'], feature['smoothing_factor'])
sim_data_zero = cut_zero(sim_time_values, sim_data_values_smooth, 1e-3*max_value, max_value)

sse = util.sse(sim_data_zero, exp_data_zero)

Expand All @@ -63,8 +59,7 @@ def setup(sim, feature, selectedTimes, selectedValues, CV_time, abstol, cache):
min_time = selectedTimes[min_index]
min_value = smooth_value[min_index]

exp_data_zero = numpy.zeros(len(smooth_value))
exp_data_zero[min_index:max_index+1] = smooth_value[min_index:max_index+1]
exp_data_zero = cut_zero(selectedTimes, smooth_value, 1e-3*max_value, max_value)

multiprocessing.get_logger().info("Dextran %s start: %s stop: %s max value: %s", name,
min_time, max_time, max_value)
Expand All @@ -85,7 +80,37 @@ def headers(experimentName, feature):
temp = ["%s_SSE" % name,]
return temp

def cut_zero(times, values, min_value, max_value):
    """Cut the raw times and values as close to min_value and max_value as
    possible on the rising front and zero everything outside that window,
    without smoothing.

    Returns an array the same length as ``times`` that is zero everywhere
    except between the samples closest to ``min_value`` and ``max_value``.
    If the front never reaches ``max_value`` the data is returned unchanged.
    """
    values = numpy.asarray(values)
    data_zero = numpy.zeros(len(times))

    # the threshold-crossing index could be off by 1 in either direction
    offset = numpy.array([-1, 0, 1])

    # only search the rising front, i.e. up to the global peak
    peak_max_index = numpy.argmax(values)
    if peak_max_index == 0:
        # peak is the very first sample; there is no rising front to cut
        return values

    max_index = numpy.argmax(values[:peak_max_index] >= max_value)

    if not max_index:
        # if max_index is zero the whole front is below the value we are
        # searching for, so just return the whole array
        return values

    min_index = numpy.argmax(values[:max_index] >= min_value)

    # for each endpoint consider the neighbor samples and keep the one whose
    # value is closest to the target; clip keeps candidate indices in bounds
    # (a raw -1 would wrap around to the LAST element in numpy)
    last_index = len(values) - 1

    check_max_index = numpy.clip(max_index + offset, 0, last_index)
    check_max_diff = numpy.abs(values[check_max_index] - max_value)

    check_min_index = numpy.clip(min_index + offset, 0, last_index)
    check_min_diff = numpy.abs(values[check_min_index] - min_value)

    min_index = check_min_index[numpy.argmin(check_min_diff)]
    max_index = check_max_index[numpy.argmin(check_max_diff)]

    data_zero[min_index:max_index+1] = values[min_index:max_index+1]

    return data_zero



Expand Down
84 changes: 59 additions & 25 deletions CADETMatch/scores/dextranShape.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,13 @@ def run(sim_data, feature):
exp_time_zero = feature['exp_time_zero']
exp_data_zero = feature['exp_data_zero']

sim_data_zero = cut_front(sim_time_values, sim_data_values, exp_time_zero,
sim_spline, sim_data_zero_sse = cut_front(sim_time_values, sim_data_values,
feature['min_value_front'], feature['max_value_front'],
feature['smoothing_factor'], feature['critical_frequency'])
pearson, diff_time = score.pearson_spline(exp_time_zero, exp_data_zero, sim_data_zero)
feature['critical_frequency'], feature['smoothing_factor'])

pearson, diff_time = score.pearson_spline_fun(exp_time_zero, exp_data_zero, sim_spline)

exp_data_zero_sse = feature['exp_data_zero_sse']
sim_data_zero_sse = scipy.interpolate.InterpolatedUnivariateSpline(exp_time_zero, sim_data_zero, ext=1)(sim_time_values)

temp = [pearson,
feature['offsetTimeFunction'](numpy.abs(diff_time)),
Expand All @@ -44,13 +43,11 @@ def setup(sim, feature, selectedTimes, selectedValues, CV_time, abstol, cache):
temp = {}
#change the stop point to be where the max positive slope is along the searched interval
name = '%s_%s' % (sim.root.experiment_name, feature['name'])
exp_time_zero, exp_data_zero, min_time, min_value, max_time, max_value, s, crit_fs = cut_front_find(selectedTimes, selectedValues, name, cache)
exp_time_zero, exp_data_zero, exp_data_zero_sse, min_time, min_value, max_time, max_value, s, crit_fs = cut_front_find(selectedTimes, selectedValues, name, cache)

multiprocessing.get_logger().info("Dextran %s start: %s stop: %s max value: %s", name,
min_time, max_time, max_value)

exp_data_zero_sse = scipy.interpolate.InterpolatedUnivariateSpline(exp_time_zero, exp_data_zero, ext=1)(selectedTimes)

temp['min_time'] = feature['start']
temp['max_time'] = feature['stop']

Expand Down Expand Up @@ -95,11 +92,11 @@ def goal(time):
max_time = float(result.x)
max_value = spline(float(result.x))

min_index = numpy.argmax(smooth_value >= 1e-2*max_value)
min_index = numpy.argmax(smooth_value >= 1e-3*max_value)
min_time = times[min_index]

def goal(time):
return abs(spline(time)-1e-2*max_value)
return abs(spline(time)-1e-3*max_value)

result = scipy.optimize.minimize(goal, min_time, method='powell')

Expand All @@ -111,50 +108,87 @@ def goal(time):

new_times = numpy.linspace(times[0], times[-1], needed_points)
new_values = spline(new_times)

data_zero = cut_zero(new_times, new_values, min_value, max_value)

max_index = numpy.argmax(new_values >= max_value)
min_index = numpy.argmax(new_values >= min_value)
return (new_times, data_zero, cut_zero(times, smooth_value, min_value, max_value),
min_time, min_value, max_time, max_value, s, crit_fs)

data_zero = numpy.zeros(needed_points)
def cut_front(times, values, min_value, max_value, crit_fs, s):
max_index = numpy.argmax(values >= max_value)

data_zero[min_index:max_index+1] = new_values[min_index:max_index+1]
if max_index == 0:
#no point high enough was found so use the highest point
s, crit_fs = smoothing.find_smoothing_factors(times, values, None, None)
max_index = numpy.argmax(values)

return new_times, data_zero, min_time, min_value, max_time, max_value, s, crit_fs
max_time = times[max_index]
max_value = values[max_index]

def cut_front(times, values, new_times, min_value, max_value, s, crit_fs):
smooth_value = smoothing.smooth_data(times, values, crit_fs, s)

spline = scipy.interpolate.InterpolatedUnivariateSpline(times, smooth_value, ext=1)

max_index = numpy.argmax(values >= max_value)
max_time = times[max_index]

def goal(time):
return abs(spline(time)-max_value)

result = scipy.optimize.minimize(goal, max_time, method='powell')

max_time = float(result.x)
max_value = spline(float(result.x))
max_index = max(numpy.argmax(values >= max_value), max_index)

min_index = numpy.argmax(values >= min_value)
min_index = numpy.argmax(values[:max_index] >= min_value)
min_time = times[min_index]

def goal(time):
if time > max_time:
#don't search to the right of the max_time for a lower value
return 1e10
return abs(spline(time)-min_value)

result = scipy.optimize.minimize(goal, min_time, method='powell')

min_time = float(result.x)
min_value = spline(float(result.x))

new_values = spline(new_times)
needed_points = int( (max_time - min_time) * 10)
new_times_spline = numpy.linspace(min_time, max_time, needed_points)
new_values_spline = spline(new_times_spline)

max_index = numpy.argmax(new_values >= max_value)
min_index = numpy.argmax(new_values >= min_value)
spline = scipy.interpolate.InterpolatedUnivariateSpline(new_times_spline, new_values_spline, ext=1)

return spline, cut_zero(times, smooth_value, min_value, max_value)

def cut_zero(times, values, min_value, max_value):
"cut the raw times and values as close to min_value and max_value as possible and set the rest to zero without smoothing"
data_zero = numpy.zeros(len(times))

offset = numpy.array([-1, 0, 1])

#find the starting point for the min_index and max_index, the real value could be off by 1 in either direction
peak_max_index = numpy.argmax(values)

max_index = numpy.argmax(values[:peak_max_index] >= max_value)

data_zero = numpy.zeros(len(new_times))
data_zero[min_index:max_index+1] = new_values[min_index:max_index+1]
if not max_index:
#if max_index is zero the whole array is below the value we are searching for so just returnt the whole array
return values

min_index = numpy.argmax(values[:max_index] >= min_value)

check_max_index = max_index + offset
check_max_value = values[check_max_index]
check_max_diff = numpy.abs(check_max_value - max_value)

check_min_index = min_index + offset
check_min_value = values[check_min_index]
check_min_diff = numpy.abs(check_min_value - min_value)

min_index = min_index + offset[numpy.argmin(check_min_diff)]
max_index = max_index + offset[numpy.argmin(check_max_diff)]

data_zero[min_index:max_index+1] = values[min_index:max_index+1]

return data_zero

Expand Down
8 changes: 6 additions & 2 deletions CADETMatch/smoothing.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,10 @@ def goal(crit_fs):

return crit_fs

def refine_smooth(times, values, x, y, start):
def refine_smooth(times, values, x, y, start, name):
if name is None:
name = 'unknown'

x = numpy.array(x)
y = numpy.array(y)

Expand Down Expand Up @@ -211,7 +214,8 @@ def find_smoothing_factors(times, values, name, cache):

s, s_knots = util.find_Left_L(all_s,knots)

s, s_knots = refine_smooth(times, values_filter, all_s, knots, s)
if s is not None:
s, s_knots = refine_smooth(times, values_filter, all_s, knots, s, name)

record_smoothing(s, s_knots, crit_fs, knots, all_s, name, cache)

Expand Down
6 changes: 3 additions & 3 deletions Examples/Example1/Dextran/NSGA3_dextran.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,14 @@
"CADETPath": "C:/Users/kosh_000/cadet_build/CADET-dev/MS_SMKL_RELEASE/bin/cadet-cli.exe",
"baseDir": "C:/Users/kosh_000/Documents/Visual Studio 2017/Projects/CADETMatch/Examples/Example1/Dextran",
"tempDir": "L:/",
"resultsDir": "fit_nsga3_deap",
"resultsDir": "fit_nsga3_deap_sse",
"CSV": "dextran_NSGA3.csv",
"checkpointFile": "check",
"stopAverage": 1,
"stopBest": 1,
"gradCheck": 1.0,
"gradVector": 1,
"searchMethod": "NSGA3",
"searchMethod": "Multistart",
"mutationRate": 1.0,
"crossoverRate": 1.0,
"generations": 1000,
Expand Down Expand Up @@ -51,7 +51,7 @@
"features": [
{
"name": "Pulse",
"type": "DextranShape"
"type": "DextranSSE"
}
]
}
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

setuptools.setup(
name="CADETMatch",
version="0.5.13",
version="0.5.16",
author="William Heymann",
author_email="[email protected]",
description="CADETMatch is a parameter estimation and error modeling library for CADET",
Expand Down

0 comments on commit d15e6fb

Please sign in to comment.