diff --git a/docs/src/BraninFunction.md b/docs/src/BraninFunction.md
index ae48ad3c9..0f1b28cfd 100644
--- a/docs/src/BraninFunction.md
+++ b/docs/src/BraninFunction.md
@@ -40,29 +40,29 @@ lower_bound = [-5, 0]
 upper_bound = [10,15]
 xys = sample(n_samples, lower_bound, upper_bound, SobolSample())
 zs = branin.(xys);
-x, y = -5:10, 0:15 # hide
-p1 = surface(x, y, (x1,x2) -> branin((x1,x2))) # hide
-xs = [xy[1] for xy in xys] # hide
-ys = [xy[2] for xy in xys] # hide
-scatter!(xs, ys, zs) # hide
-p2 = contour(x, y, (x1,x2) -> branin((x1,x2))) # hide
-scatter!(xs, ys) # hide
-plot(p1, p2, title="True function") # hide
+x, y = -5:10, 0:15
+p1 = surface(x, y, (x1,x2) -> branin((x1,x2)))
+xs = [xy[1] for xy in xys]
+ys = [xy[2] for xy in xys]
+scatter!(xs, ys, zs)
+p2 = contour(x, y, (x1,x2) -> branin((x1,x2)))
+scatter!(xs, ys)
+plot(p1, p2, title="True function")
 ```
 
-Now it's time to fitting different surrogates and then we will plot them.
-We will have a look on `Kriging Surrogate`:
+Now it's time to try fitting different surrogates, which we will then plot.
+We will have a look at the `Kriging` surrogate:
 
 ```@example BraninFunction
-kriging_surrogate = Kriging(xys, zs, lower_bound, upper_bound, p=[1.9, 1.9])
+kriging_surrogate = Kriging(xys, zs, lower_bound, upper_bound)
 ```
 
 ```@example BraninFunction
-p1 = surface(x, y, (x, y) -> kriging_surrogate([x y])) # hide
-scatter!(xs, ys, zs, marker_z=zs) # hide
-p2 = contour(x, y, (x, y) -> kriging_surrogate([x y])) # hide
-scatter!(xs, ys, marker_z=zs) # hide
-plot(p1, p2, title="Kriging Surrogate") # hide
+p1 = surface(x, y, (x, y) -> kriging_surrogate([x y]))
+scatter!(xs, ys, zs, marker_z=zs)
+p2 = contour(x, y, (x, y) -> kriging_surrogate([x y]))
+scatter!(xs, ys, marker_z=zs)
+plot(p1, p2, title="Kriging Surrogate")
 ```
 
 Now, we will have a look on `Inverse Distance Surrogate`:
diff --git a/docs/src/kriging.md b/docs/src/kriging.md
index 06a3832b7..2f5c74f58 100644
--- a/docs/src/kriging.md
+++ b/docs/src/kriging.md
@@ -38,7 +38,7 @@ With our sampled points we can build the Kriging surrogate using the `Kriging` f
 `kriging_surrogate` behaves like an ordinary function which we can simply plot. A nice statistical property of this surrogate is being able to calculate the error of the function at each point, we plot this as a confidence interval using the `ribbon` argument.
 
 ```@example kriging_tutorial1d
-kriging_surrogate = Kriging(x, y, lower_bound, upper_bound, p=1.9);
+kriging_surrogate = Kriging(x, y, lower_bound, upper_bound);
 
 plot(x, y, seriestype=:scatter, label="Sampled points", xlims=(lower_bound, upper_bound), ylims=(-7, 17), legend=:top)
 plot!(xs, f.(xs), label="True function", legend=:top)
@@ -107,7 +107,7 @@ plot(p1, p2, title="True function") # hide
 Using the sampled points we build the surrogate, the steps are analogous to the 1-dimensional case.
 
 ```@example kriging_tutorialnd
-kriging_surrogate = Kriging(xys, zs, lower_bound, upper_bound, p=[1.9, 1.9])
+kriging_surrogate = Kriging(xys, zs, lower_bound, upper_bound, p=[2.0, 2.0], theta=[0.03, 0.003])
 ```
 
 ```@example kriging_tutorialnd
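With the constructor defaults introduced in `src/Kriging.jl` below, the tutorials above no longer hand-tune `p`. As a reference, a minimal end-to-end sketch of the updated Branin workflow, assuming Surrogates.jl with this patch applied; the `branin` definition (which the doc page defines earlier) is reproduced here in its standard form so the snippet is self-contained, and `n_samples = 50` is an arbitrary choice:

```julia
using Surrogates

# Standard Branin test function; the doc page defines this earlier.
function branin(x)
    x1, x2 = x[1], x[2]
    b, c = 5.1 / (4 * pi^2), 5 / pi
    r, s, t = 6.0, 10.0, 1 / (8 * pi)
    return (x2 - b * x1^2 + c * x1 - r)^2 + s * (1 - t) * cos(x1) + s
end

lower_bound = [-5.0, 0.0]
upper_bound = [10.0, 15.0]
xys = sample(50, lower_bound, upper_bound, SobolSample())
zs = branin.(xys)

# p and theta now default to sensible values, so no keyword tuning is needed:
kriging_surrogate = Kriging(xys, zs, lower_bound, upper_bound)
kriging_surrogate((-3.0, 12.0))  # point estimate at an unsampled location
```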
diff --git a/src/Kriging.jl b/src/Kriging.jl
index eef379a29..d5285200a 100644
--- a/src/Kriging.jl
+++ b/src/Kriging.jl
@@ -1,7 +1,8 @@
 #=
-One-dimensional Kriging method, following this paper:
+One-dimensional Kriging method, following these papers:
+"Efficient Global Optimization of Expensive Black Box Functions" and
 "A Taxonomy of Global Optimization Methods Based on Response Surfaces"
-by DONALD R. JONES
+both by DONALD R. JONES
 =#
 mutable struct Kriging{X, Y, L, U, P, T, M, B, S, R} <: AbstractSurrogate
     x::X
@@ -35,20 +36,21 @@ function std_error_at_point(k::Kriging, val)
     n = length(k.x)
     d = length(k.x[1])
     r = zeros(eltype(k.x[1]), n, 1)
-    @inbounds for i in 1:n
-        sum = zero(eltype(k.x[1]))
-        for l in 1:d
-            sum = sum + k.theta[l] * norm(val[l] - k.x[i][l])^(k.p[l])
-        end
-        r[i] = exp(-sum)
-    end
+    r = [let
+             sum = zero(eltype(k.x[1]))
+             for l in 1:d
+                 sum = sum + k.theta[l] * norm(val[l] - k.x[i][l])^(k.p[l])
+             end
+             exp(-sum)
+         end
+         for i in 1:n]
+
     one = ones(eltype(k.x[1]), n, 1)
     one_t = one'
     a = r' * k.inverse_of_R * r
-    a = a[1]
     b = one_t * k.inverse_of_R * one
-    b = b[1]
-    mean_squared_error = k.sigma * (1 - a + (1 - a)^2 / (b))
+
+    mean_squared_error = k.sigma * (1 - a[1] + (1 - a[1])^2 / b[1])
     return sqrt(abs(mean_squared_error))
 end
@@ -56,38 +58,27 @@ end
 Gives the current estimate for 'val' with respect to the Kriging object k.
 """
 function (k::Kriging)(val::Number)
-    phi = z -> exp(-(abs(z))^k.p)
     n = length(k.x)
-    prediction = zero(eltype(k.x[1]))
-    for i in 1:n
-        prediction = prediction + k.b[i] * phi(val - k.x[i])
-    end
-    prediction = k.mu + prediction
-    return prediction
+    return k.mu + sum(k.b[i] * exp(-sum(k.theta * abs(val - k.x[i])^k.p)) for i in 1:n)
 end
 
 """
 Returns sqrt of expected mean_squared_error error at the point.
 """
 function std_error_at_point(k::Kriging, val::Number)
-    phi(z) = exp(-(abs(z))^k.p)
     n = length(k.x)
-    r = zeros(eltype(k.x[1]), n, 1)
-    @inbounds for i in 1:n
-        r[i] = phi(val - k.x[i])
-    end
-    one = ones(eltype(k.x[1]), n, 1)
+    r = [exp(-k.theta * abs(val - k.x[i])^k.p) for i in 1:n]
+    one = ones(eltype(k.x), n)
     one_t = one'
     a = r' * k.inverse_of_R * r
-    a = a[1]
     b = one_t * k.inverse_of_R * one
-    b = b[1]
-    mean_squared_error = k.sigma * (1 - a + (1 - a)^2 / (b))
+
+    mean_squared_error = k.sigma * (1 - a[1] + (1 - a[1])^2 / b[1])
     return sqrt(abs(mean_squared_error))
 end
 
 """
-    Kriging(x,y,lb::Number,ub::Number;p::Number=1.0)
+    Kriging(x, y, lb::Number, ub::Number; p::Number=2.0, theta::Number = 0.5/std(x)^p)
 
 Constructor for type Kriging.
 
@@ -95,26 +86,56 @@
 -(x,y): sampled points
 -p: value between 0 and 2 modelling the
    smoothness of the function being approximated, 0-> rough 2-> C^infinity
+- theta: value > 0 modeling how rapidly the function changes over the domain.
 """
-function Kriging(x, y, lb::Number, ub::Number; p = 1.0, theta = 1.0)
+function Kriging(x, y, lb::Number, ub::Number; p = 2.0,
+                 theta = 0.5 / max(1e-6 * abs(ub - lb), std(x))^p)
     if length(x) != length(unique(x))
         println("There exists a repetion in the samples, cannot build Kriging.")
         return
     end
+
+    if p > 2.0 || p < 0.0
+        throw(ArgumentError("Hyperparameter p must be between 0 and 2! Got: $p."))
+    end
+
+    if theta ≤ 0
+        throw(ArgumentError("Hyperparameter theta must be positive! Got: $theta."))
+    end
+
     mu, b, sigma, inverse_of_R = _calc_kriging_coeffs(x, y, p, theta)
     Kriging(x, y, lb, ub, p, theta, mu, b, sigma, inverse_of_R)
 end
 
 function _calc_kriging_coeffs(x, y, p::Number, theta::Number)
     n = length(x)
-    R = zeros(eltype(x[1]), n, n)
-    R = [exp(-theta * abs(x[i] - x[j])^p)
-         for j in 1:n, i in 1:n]
+    R = [exp(-theta * abs(x[i] - x[j])^p) for i in 1:n, j in 1:n]
+
+    # Estimate nugget based on maximum allowed condition number
+    # This regularizes R to allow for points being close to each other without R becoming
+    # singular, at the cost of slightly relaxing the interpolation condition
+    # Derived from "An analytic comparison of regularization methods for Gaussian Processes"
+    # by Mohammadi et al. (https://arxiv.org/pdf/1602.00853.pdf)
+    λ = eigen(R).values
+    λmax = λ[end]
+    λmin = λ[1]
+
+    κmax = 1e8
+    λdiff = λmax - κmax * λmin
+    if λdiff ≥ 0
+        nugget = λdiff / (κmax - 1)
+    else
+        nugget = 0.0
+    end
 
-    one = ones(eltype(x[1]), n, 1)
+    one = ones(eltype(x[1]), n)
     one_t = one'
+
+    R = R + Diagonal(nugget * one)
+
     inverse_of_R = inv(R)
 
     mu = (one_t * inverse_of_R * y) / (one_t * inverse_of_R * one)
     b = inverse_of_R * (y - one * mu)
     sigma = ((y - one * mu)' * inverse_of_R * (y - one * mu)) / n
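The 1-D constructor above now validates its hyperparameters and derives `theta` from the spread of the samples. A small sketch of how those defaults behave, assuming Surrogates.jl with this patch applied (the sample data is illustrative):

```julia
using Surrogates, Statistics

x = [1.0, 2.0, 4.0, 7.0]
y = sin.(x)
k = Kriging(x, y, 0.0, 10.0)  # p defaults to 2.0

# std(x) is far above the 1e-6 * (ub - lb) floor here, so the default reduces to:
k.theta ≈ 0.5 / std(x)^2  # true

# Out-of-range hyperparameters now throw instead of silently producing a bad fit:
# Kriging(x, y, 0.0, 10.0, p = 3.0)       # ArgumentError
# Kriging(x, y, 0.0, 10.0, theta = -1.0)  # ArgumentError
```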
Got: $theta")) + end + mu, b, sigma, inverse_of_R = _calc_kriging_coeffs(x, y, p, theta) Kriging(x, y, lb, ub, p, theta, mu, b, sigma, inverse_of_R) end function _calc_kriging_coeffs(x, y, p::Number, theta::Number) n = length(x) - R = zeros(eltype(x[1]), n, n) - R = [exp(-theta * abs(x[i] - x[j])^p) - for j in 1:n, i in 1:n] + R = [exp(-theta * abs(x[i] - x[j])^p) for i in 1:n, j in 1:n] + + # Estimate nugget based on maximum allowed condition number + # This regularizes R to allow for points being close to each other without R becoming + # singular, at the cost of slightly relaxing the interpolation condition + # Derived from "An analytic comparison of regularization methods for Gaussian Processes" + # by Mohammadi et al (https://arxiv.org/pdf/1602.00853.pdf) + λ = eigen(R).values + λmax = λ[end] + λmin = λ[1] + + κmax = 1e8 + λdiff = λmax - κmax * λmin + if λdiff ≥ 0 + nugget = λdiff / (κmax - 1) + else + nugget = 0.0 + end - one = ones(eltype(x[1]), n, 1) + one = ones(eltype(x[1]), n) one_t = one' + + R = R + Diagonal(nugget * one) + inverse_of_R = inv(R) + mu = (one_t * inverse_of_R * y) / (one_t * inverse_of_R * one) b = inverse_of_R * (y - one * mu) sigma = ((y - one * mu)' * inverse_of_R * (y - one * mu)) / n @@ -127,17 +148,30 @@ end Constructor for Kriging surrogate. - (x,y): sampled points -- p: array of values 0<=p<2 modeling the +- p: array of values 0<=p<=2 modeling the smoothness of the function being approximated in the i-th variable. low p -> rough, high p -> smooth - theta: array of values > 0 modeling how much the function is changing in the i-th variable. """ -function Kriging(x, y, lb, ub; p = collect(one.(x[1])), theta = collect(one.(x[1]))) +function Kriging(x, y, lb, ub; p = 2 .* collect(one.(x[1])), + theta = [0.5 / max(1e-6 * norm(ub .- lb), std(x_i[i] for x_i in x))^p[i] + for i in 1:length(x[1])]) if length(x) != length(unique(x)) println("There exists a repetition in the samples, cannot build Kriging.") return end + + for i in 1:length(x[1]) + if p[i] > 2.0 || p[i] < 0.0 + throw(ArgumentError("All p must be between 0 and 2! Got: $p.")) + end + + if theta[i] ≤ 0.0 + throw(ArgumentError("All theta must be positive! 
@@ -155,8 +189,28 @@ function _calc_kriging_coeffs(x, y, p, theta)
          end
          for j in 1:n, i in 1:n]
 
-    one = ones(n, 1)
+    # Estimate nugget based on maximum allowed condition number
+    # This regularizes R to allow for points being close to each other without R becoming
+    # singular, at the cost of slightly relaxing the interpolation condition
+    # Derived from "An analytic comparison of regularization methods for Gaussian Processes"
+    # by Mohammadi et al. (https://arxiv.org/pdf/1602.00853.pdf)
+    λ = eigen(R).values
+
+    λmax = λ[end]
+    λmin = λ[1]
+
+    κmax = 1e8
+    λdiff = λmax - κmax * λmin
+    if λdiff ≥ 0
+        nugget = λdiff / (κmax - 1)
+    else
+        nugget = 0.0
+    end
+
+    one = ones(eltype(x[1]), n)
     one_t = one'
+
+    R = R + Diagonal(nugget * one[:, 1])
     inverse_of_R = inv(R)
 
     mu = (one_t * inverse_of_R * y) / (one_t * inverse_of_R * one)
diff --git a/test/Kriging.jl b/test/Kriging.jl
index 7d5527cd1..e2b44934f 100644
--- a/test/Kriging.jl
+++ b/test/Kriging.jl
@@ -1,6 +1,8 @@
 using LinearAlgebra
 using Surrogates
 using Test
+using Statistics
+
 #1D
 lb = 0.0
 ub = 10.0
@@ -8,17 +10,30 @@ f = x -> log(x) * exp(x)
 x = sample(5, lb, ub, SobolSample())
 y = f.(x)
 my_p = 1.9
-my_k = Kriging(x, y, lb, ub, p = 2.3)
+
+# Check input validation for 1D Kriging
+@test_throws ArgumentError my_k=Kriging(x, y, lb, ub, p = -1.0)
+@test_throws ArgumentError my_k=Kriging(x, y, lb, ub, p = 3.0)
+@test_throws ArgumentError my_k=Kriging(x, y, lb, ub, theta = -2.0)
+
+my_k = Kriging(x, y, lb, ub, p = my_p)
+@test my_k.theta ≈ 0.5 * std(x)^(-my_p)
+
 add_point!(my_k, 4.0, 75.68)
 add_point!(my_k, [5.0, 6.0], [238.86, 722.84])
 pred = my_k(5.5)
+@test 238.86 ≤ pred ≤ 722.84
+@test my_k(5.0) ≈ 238.86
+@test std_error_at_point(my_k, 5.0) < 1e-3 * my_k(5.0)
+
 #WITHOUT ADD POINT
 x = [1.0, 2.0, 3.0]
 y = [4.0, 5.0, 6.0]
 my_p = 1.3
 my_k = Kriging(x, y, lb, ub, p = my_p)
 est = my_k(1.0)
+@test est ≈ 4.0
 std_err = std_error_at_point(my_k, 1.0)
 @test std_err < 10^(-6)
 
@@ -42,7 +57,7 @@ est = my_k(4.0)
 std_err = std_error_at_point(my_k, 4.0)
 @test std_err < 10^(-6)
 
-#Testin kwargs 1D
+#Testing kwargs 1D
 kwar_krig = Kriging(x, y, lb, ub);
 
 #ND
@@ -90,5 +105,14 @@ est = my_k((10.0, 11.0, 12.0))
 std_err = std_error_at_point(my_k, (10.0, 11.0, 12.0))
 @test std_err < 10^(-6)
 
-#test kwargs ND
+#test kwargs ND (hyperparameter initialization)
 kwarg_krig_ND = Kriging(x, y, lb, ub)
+
+@test_throws ArgumentError Kriging(x, y, lb, ub, p = 3 * my_p)
+@test_throws ArgumentError Kriging(x, y, lb, ub, p = -my_p)
+@test_throws ArgumentError Kriging(x, y, lb, ub, theta = -my_theta)
+
+d = length(x[3])
+
+@test all(==(2.0), kwarg_krig_ND.p)
+@test all(kwarg_krig_ND.theta .≈ [0.5 / var(x_i[ℓ] for x_i in x) for ℓ in 1:d])
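The assertions added above lean on the defining property of Kriging: it interpolates, so sampled points are reproduced (up to floating-point error) with essentially zero predicted error, while uncertainty grows away from the samples. A sketch with the same toy data as the test file, assuming Surrogates.jl with this patch applied:

```julia
using Surrogates

x = [1.0, 2.0, 3.0]
y = [4.0, 5.0, 6.0]
my_k = Kriging(x, y, 0.0, 10.0, p = 1.3)

my_k(1.0)                      # ≈ 4.0: a sampled point is reproduced
std_error_at_point(my_k, 1.0)  # ≈ 0: no predictive uncertainty at a sample
std_error_at_point(my_k, 2.5)  # > 0: uncertainty between samples
```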
diff --git a/test/optimization.jl b/test/optimization.jl
index 383b24727..149164722 100644
--- a/test/optimization.jl
+++ b/test/optimization.jl
@@ -1,8 +1,5 @@
 using Surrogates
 using LinearAlgebra
-using Flux
-using Flux: @epochs
-using PolyChaos
 
 #######SRBF############
 ##### 1D #####
@@ -21,18 +18,32 @@ a = 2
 b = 6
 
 #Using Kriging
-my_k_SRBF1 = Kriging(x, y, lb, ub)
-surrogate_optimize(objective_function, SRBF(), a, b, my_k_SRBF1, UniformSample())
+
+x = [2.5, 4.0, 6.0]
+y = [6.0, 9.0, 13.0]
+my_k_SRBF1 = Kriging(x, y, lb, ub; p)
+xstar, fstar = surrogate_optimize(objective_function, SRBF(), a, b, my_k_SRBF1,
+                                  UniformSample())
 
 #Using RadialBasis
+
+x = [2.5, 4.0, 6.0]
+y = [6.0, 9.0, 13.0]
 my_rad_SRBF1 = RadialBasis(x, y, a, b, rad = linearRadial())
-surrogate_optimize(objective_function, SRBF(), a, b, my_rad_SRBF1, UniformSample())
+(xstar, fstar) = surrogate_optimize(objective_function, SRBF(), a, b, my_rad_SRBF1,
+                                    UniformSample())
 
+x = [2.5, 4.0, 6.0]
+y = [6.0, 9.0, 13.0]
 my_wend_1d = Wendland(x, y, lb, ub)
-surrogate_optimize(objective_function, SRBF(), a, b, my_wend_1d, UniformSample())
+xstar, fstar = surrogate_optimize(objective_function, SRBF(), a, b, my_wend_1d,
+                                  UniformSample())
 
+x = [2.5, 4.0, 6.0]
+y = [6.0, 9.0, 13.0]
 my_earth1d = EarthSurrogate(x, y, lb, ub)
-surrogate_optimize(objective_function, SRBF(), a, b, my_earth1d, LowDiscrepancySample(2))
+xstar, fstar = surrogate_optimize(objective_function, SRBF(), a, b, my_earth1d,
+                                  LowDiscrepancySample(2))
 
 ##### ND #####
 objective_function_ND = z -> 3 * norm(z) + 1
 lb = [1.0, 1.0]
 ub = [6.0, 6.0]
 x = sample(5, lb, ub, SobolSample())
 y = objective_function_ND.(x)
-p = [1.5, 1.5]
-theta = [1.0, 1.0]
 
 #Kriging
 
@@ -66,6 +75,7 @@ alpha = [2.0, 2.0]
 n = 4
 my_loba_ND = LobachevskySurrogate(x, y, lb, ub)
 surrogate_optimize(objective_function_ND, SRBF(), lb, ub, my_loba_ND, UniformSample())
+
 #Linear
 lb = [1.0, 1.0]
 ub = [6.0, 6.0]
@@ -191,11 +201,9 @@ lb = [-1.0, x2_constraint, -1.0]
 ub = [6.0, x2_constraint, 6.0]
 x = sample(5, lb, ub, sampler)
 y = objective_function_section.(x)
-p = [1.2, 1.2, 1.2]
-theta = [2.0, 2.0, 2.0]
 
 #Kriging
-my_k_EIN_section = Kriging(x, y, lb, ub; p = p)
+my_k_EIN_section = Kriging(x, y, lb, ub)
 
 # Constrain our sampling to the plane where x[2] = 1
 surrogate_optimize(objective_function_section, EI(), lb, ub, my_k_EIN_section, sampler)
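The optimization tests above now capture the `(xstar, fstar)` pair that `surrogate_optimize` returns. A sketch of that pattern, assuming Surrogates.jl with this patch applied; the linear objective is inferred from the sampled values (`y = 2x + 1` matches `[6.0, 9.0, 13.0]`) and the bounds are illustrative:

```julia
using Surrogates

objective_function = x -> 2 * x + 1
a, b = 2.0, 6.0  # search interval

x = [2.5, 4.0, 6.0]
y = objective_function.(x)  # [6.0, 9.0, 13.0], as in the test file

my_k = Kriging(x, y, a, b)  # default p = 2.0 and std-based theta
xstar, fstar = surrogate_optimize(objective_function, SRBF(), a, b, my_k,
                                  UniformSample())
```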