1 Introduction

The Normality Transformation via Optimized Skewness and Kurtosis (OSKT) is a normalization method that simultaneously accounts for deviations in both the skewness and the kurtosis of non-normal data.

2 Required Packages

The latest version of the osktnorm package can be installed from CRAN (for example, with install.packages("osktnorm")). Once the package is installed, load it into the R working environment with the following command:

library(osktnorm)

3 Normality Transformation Using OSKT

In the following code snippet, a right-skewed sample of 300 observations is generated with R's rlnorm function and then normalized using OSKT. For this purpose, the osktfast function is applied by simply passing it the vector of original observations, and the results are stored in the object res_oskt.

# Fix the RNG seed so the simulated sample is reproducible
set.seed(12)
# Draw 300 log-normal values. The arguments are spelled out in full instead of
# relying on partial matching of `mean`/`sd` to `meanlog`/`sdlog`.
x_orig <- rlnorm(300, meanlog = 0, sdlog = 0.5) # Generate right-skewed data

# Apply the OSKT normalization to the raw sample; the whole result object is
# kept because several of its components are read afterwards
res_oskt <- osktfast(x_orig) 

# Normalized observations, taken from the `transformed` component of the result
x_transformed <- res_oskt$transformed
# Preview the first five transformed values
head(x_transformed, 5)
  [1] -1.802405  1.399281 -1.250515 -1.210836 -2.307431

# Pull the fitted quantities out of the OSKT result. Per the labels printed
# below, `g` is reported as the optimized skewness, `h` as the optimized
# kurtosis, and `value` as the Anderson-Darling statistic at the optimum.
g_star <- res_oskt$g
h_star <- res_oskt$h
A2 <- res_oskt$value

cat("Optimized skewness: ", g_star, "\n")
  Optimized skewness:  -0.5909241
cat("Optimized kurtosis: ", h_star, "\n")
  Optimized kurtosis:  0.07987875
cat("Anderson-Darling statistic at the optimum: ", A2, "\n")
  Anderson-Darling statistic at the optimum:  0.1056021

The code snippet below visualizes the original and normalized observations using histograms and density plots for comparison purposes.

# Remember the current graphics settings so they can be put back at the end
oldpar <- par(no.readonly = TRUE)

# Colours: translucent fills for the bars, slightly more opaque keys for the legend
fill_orig  <- rgb(0.2, 0.4, 0.8, 0.4)
fill_trans <- rgb(0.8, 0.3, 0.3, 0.4)
key_orig   <- rgb(0.2, 0.4, 0.8, 0.6)
key_trans  <- rgb(0.8, 0.3, 0.3, 0.6)

# Common bin edges so both histograms share one grid
breaks <- pretty(range(c(x_orig, x_transformed)), n = 25)

# Histogram counts (not drawn yet) and kernel density estimates of both samples
h_orig  <- hist(x_orig, breaks = breaks, plot = FALSE)
h_trans <- hist(x_transformed, breaks = breaks, plot = FALSE)
d_orig  <- density(x_orig)
d_trans <- density(x_transformed)

# Height of the tallest element that will be drawn: bars, density curves,
# and the peak of the standard normal reference curve
ymax <- max(
  h_orig[["density"]], h_trans[["density"]],
  d_orig[["y"]], d_trans[["y"]],
  dnorm(0)
)

# Original sample: opens the plot and fixes the axes (5% head room on top)
hist(
  x_orig,
  breaks = breaks, freq = FALSE, ylim = c(0, ymax * 1.05),
  col = fill_orig, border = "white",
  main = "Before and After OSKT Transformation", xlab = "Value"
)
lines(d_orig, col = "blue", lwd = 2)

# Transformed sample, overlaid on the same axes
hist(
  x_transformed,
  breaks = breaks, freq = FALSE,
  col = fill_trans, border = "white", add = TRUE
)
lines(d_trans, col = "red", lwd = 2)

# Standard normal reference
curve(dnorm(x), add = TRUE, lwd = 2, lty = 2, col = "black")

# Thick line segments stand in for the histogram fills in the legend
legend(
  "topleft",
  legend = c("Original", "Transformed", "Original Density", "OSKT Density", "Standard Normal"),
  col = c(key_orig, key_trans, "blue", "red", "black"),
  lwd = c(10, 10, 2, 2, 2),
  lty = c(1, 1, 1, 1, 2),
  bty = "n"
)

# Restore the graphics settings saved at the top
par(oldpar)

4 Back Transformation: Recovering Original Values

Back-transformation can be performed using the backosktfast function, which uses the Brent–Dekker algorithm for efficiency.

# Mean and standard deviation of the original sample, handed to the
# back-transformation together with the fitted g and h
X_mean <- mean(x_orig)
X_sd   <- sd(x_orig)

# Map the normalized values back to the original scale;
# method = "brent" selects the Brent–Dekker root finder
res_back <- backosktfast(
              Z = x_transformed,
              X_mean = X_mean, X_sd = X_sd,
              g = g_star, h = h_star,
              method = "brent")

# Recovered observations, taken from the `X_orig` component of the result
x_recovered <- res_back$X_orig
# Preview the first five recovered values
head(x_recovered, 5)
  [1] 0.4759235 2.2021046 0.6189750 0.6304848 0.3670768

# Save the graphics settings before plotting
oldpar <- par(no.readonly = TRUE)

# One fill colour per series, defined once and used for both the bars and the
# legend swatches so the two always agree (the legend previously used a
# slightly different blue than the bars of the original sample)
hist_cols <- c(
  "Original"         = rgb(0.2, 0.4, 0.9, 0.4),
  "Transformed"      = rgb(0.8, 0.3, 0.3, 0.4),
  "Back-transformed" = rgb(0.2, 0.8, 0.2, 0.4)
)

# Common bin edges covering all three samples
breaks <- pretty(range(c(x_orig, x_transformed, x_recovered)), n = 30)

# The first call opens the plot; the other two are overlaid on the same axes
hist(x_orig, breaks = breaks, freq = FALSE, col = hist_cols[["Original"]],
  border = "white", main = "OSKT Transformation & Back Transformation",
  xlab = "Value")
hist(x_transformed, breaks = breaks, freq = FALSE,
  col = hist_cols[["Transformed"]], border = "white", add = TRUE)
hist(x_recovered, breaks = breaks, freq = FALSE,
  col = hist_cols[["Back-transformed"]], border = "white", add = TRUE)

legend("topleft", legend = names(hist_cols), fill = unname(hist_cols))

       
# Round-trip check: all.equal() yields TRUE when the two vectors agree within
# the tolerance, and otherwise a message describing the difference (as here)
(all.equal(x_orig, x_recovered, tolerance = 1e-6)) 
  [1] "Mean relative difference: 0.0006286436"
# Restore the graphics settings saved before plotting
par(oldpar)

5 Back-transformation Diagnostics

Diagnostic metrics compare the original and recovered values to quantify the numerical accuracy of the inversion.

# Compare only the positions where both the original and the recovered value
# are finite
ok <- is.finite(x_orig) & is.finite(x_recovered)
xo <- x_orig[ok]
xr <- x_recovered[ok]

# Signed recovery error: recovered minus original
err <- xr - xo

# Error magnitudes and linear agreement between the two vectors
RMSE <- sqrt(mean(err^2))
MAE  <- mean(abs(err))
COR  <- cor(xo, xr)

# Collect the metrics in a one-row data frame, then transpose it so that each
# metric is printed on its own line
back_stats <- data.frame(
  RMSE = RMSE,
  MAE = MAE,
  Correlation = COR,
  R2 = COR^2
)
round(t(back_stats), 8)
                    [,1]
  RMSE        0.00089459
  MAE         0.00069680
  Correlation 1.00000000
  R2          1.00000000

6 Comparison of Normality Methods

Below, we generate a left-skewed variable using the ghdist function of the groupcompare package and compare OSKT with the Box-Cox (BC) and Yeo-Johnson (YJ) transformations.

# Re-seed and overwrite x_orig with a new sample of 300 values drawn with
# groupcompare::ghdist (A = 0, B = 1, g = -0.49, h = 0)
set.seed(12)
x_orig <- groupcompare::ghdist(n=300, A=0, B=1, g=-0.49, h=0)

# Transform the same sample with each method, keeping only the `transformed`
# component of every result.
# NOTE(review): makepositive=TRUE presumably shifts the data to be strictly
# positive before Box-Cox is applied — confirm against the osktnorm docs.
x_bc <- osktnorm::boxcox(x_orig, makepositive=TRUE)$transformed 
x_yj <- osktnorm::yeojohnson(x_orig)$transformed  
x_oskt <- osktfast(x_orig)$transformed 

# Summarise the shape of one sample.
#
# Non-finite entries (NA, NaN, Inf, -Inf) are dropped first. Returns a named
# numeric vector with:
#   Skew - third central moment divided by sd^3
#   Kurt - fourth central moment divided by sd^4, minus 3 (excess kurtosis)
#   SW   - p-value of shapiro.test()
#   CVM  - p-value reported by cvmtest()
#   PPM  - statistic reported by pearsonp(), with its name removed
get_stats <- function(x) {
  finite_x <- x[is.finite(x)]

  # Centre once and reuse the deviations for both moments
  centered <- finite_x - mean(finite_x)
  spread <- sd(finite_x)

  skewness <- mean(centered^3) / spread^3
  excess_kurtosis <- mean(centered^4) / spread^4 - 3

  c(
    Skew = skewness,
    Kurt = excess_kurtosis,
    SW   = shapiro.test(finite_x)$p.value,
    CVM  = cvmtest(finite_x)$p.value,
    PPM  = unname(pearsonp(finite_x)$statistic)
  )
}

# Run get_stats() on every version of the sample and stack the results,
# one row per method (row names come from the list names)
samples <- list(ORG = x_orig, BC = x_bc, YJ = x_yj, OSKT = x_oskt)
pval_table <- do.call(rbind, lapply(samples, get_stats))

# Show the table rounded to four decimals
as.data.frame(round(pval_table, 4))
          Skew    Kurt     SW    CVM  PPM
  ORG  -1.8642  5.9759 0.0000 0.0001 9.24
  BC   -0.2397 -0.5635 0.0047 0.1551 0.82
  YJ   -0.0454 -0.3109 0.1257 0.3844 0.71
  OSKT -0.2130  0.0118 0.0631 0.9100 0.56

# Restore previously saved global options.
# NOTE(review): `oldopts` is not assigned anywhere in the visible code —
# presumably a setup step saved it with `oldopts <- options(...)`; confirm,
# otherwise this line fails with "object 'oldopts' not found".
options(oldopts)

OSKT: Normalization via Optimized Skewness and Kurtosis

Zeynel Cebeci