Class "sdcMicroObj"

Class to save all information about the SDC process

createSdcObj(
  dat,
  keyVars,
  numVars = NULL,
  pramVars = NULL,
  ghostVars = NULL,
  weightVar = NULL,
  hhId = NULL,
  strataVar = NULL,
  sensibleVar = NULL,
  excludeVars = NULL,
  options = NULL,
  seed = NULL,
  randomizeRecords = FALSE,
  alpha = 1
)

undolast(object)

strataVar(object) <- value

# S4 method for class 'sdcMicroObj,characterOrNULL'
strataVar(object) <- value

Arguments

dat: The microdata set. A numeric matrix or data frame containing the data.
keyVars: Indices or names of categorical key variables. They must, of course, match with the columns of ‘dat’.
numVars: Index or names of continuous key variables.
pramVars: Indices or names of categorical variables considered to be pramed.
ghostVars: if specified, a list with each element being a list of exactly two elements. The first element must be a character vector specifying exactly one variable name that was also specified as a categorical key variable (keyVars), while the second element is a character vector of valid variable names (that must not be listed as keyVars). If localSuppression or kAnon was applied, the resulting suppression pattern for each key-variable is transferred to the dependent variables.
weightVar: Indices or name determining the vector of sampling weights.
hhId: Index or name of the cluster ID (if available).
strataVar: Indices or names of stratification variables.
sensibleVar: Indices or names of sensitive variables (for l-diversity)
excludeVars: which variables of dat should not be included in result-object? Users may specify a vector of variable-names available in dat that were not specified in either keyVars, numVars, pramVars, ghostVars, hhId, strataVar or sensibleVar.
options: additional options (if specified, a list must be used as input)
seed: (numeric) number specifying the seed which will be set to allow for reproducibility. The number will be rounded and saved as element seed in slot options.
randomizeRecords: (logical) if TRUE, the order of observations in the input microdata set will be randomized.
alpha: numeric between 0 and 1 specifying how much keys containing NAs should contribute to the frequency calculation, which is also crucial for risk estimation.
object: a sdcMicroObj-class object
value: NULL or a character vector of length 1 specifying a valid variable name

Value

a sdcMicroObj-class object

an object of class sdcMicroObj with modified slot @strataVar

Objects from the Class

Objects can be created by calls of the form new("sdcMicroObj", ...).

References

Templ, M. and Meindl, B. and Kowarik, A.: Statistical Disclosure Control for Micro-Data Using the R Package sdcMicro, Journal of Statistical Software, 67 (4), 1–36, 2015. doi:10.18637/jss.v067.i04

Author

Bernhard Meindl, Alexander Kowarik, Matthias Templ, Elias Rut

Examples

## we can also specify ghost (linked) variables
## these variables are linked to some categorical key variables
## and have the same suppression pattern as the variable that they
## are linked to after localSuppression() has been applied
## load the example data and duplicate three of its columns; the copies
## serve as the linked variables further below
data(testdata)
testdata[["electcon2"]] <- testdata[["electcon"]]
testdata[["electcon3"]] <- testdata[["electcon"]]
testdata[["water2"]] <- testdata[["water"]]

## categorical key variables, continuous key variables and the name of
## the column holding the sampling weights
keyVars <- c(
  "urbrur", "roof", "walls", "water",
  "electcon", "relat", "sex"
)
numVars <- c("expend", "income", "savings")
w <- "sampling_weight"

## Variables that are not key variables can be given the same suppression
## pattern as a selected key variable by declaring them as 'ghost'
## variables. Each entry of 'ghostVars' is a list of two elements: the
## key variable first, then the variable(s) linked to it.

## link 'electcon2' and 'electcon3' to key-variable 'electcon'
ghostVars <- list(
  list("electcon", c("electcon2", "electcon3"))
)

# \donttest{
## donttest because Examples with CPU time > 2.5 times elapsed time
## link 'water2' to key-variable 'water' (second entry of 'ghostVars')
ghostVars[[2]] <- list("water", "water2")

## create the sdcMicroObj; the weight column is passed through the full
## argument name 'weightVar' instead of relying on partial matching of 'w'
obj <- createSdcObj(
  testdata,
  keyVars = keyVars,
  numVars = numVars,
  weightVar = w,
  ghostVars = ghostVars
)

## enforce 3-anonymity on the selected key variables, then print the
## updated object
obj <- kAnon(obj, k = 3)
obj
#> The input dataset consists of 4580 rows and 18 variables.
#>   --> Categorical key variables: urbrur, roof, walls, water, electcon, relat, sex
#>   --> Numerical key variables: expend, income, savings
#>   --> Weight variable: sampling_weight
#>   --> Ghost variable(s) exist
#>     Variable(s) electcon2, electcon3 are linked to key variable electcon
#>     Variable(s) water2 are linked to key variable water
#> ----------------------------------------------------------------------
#> 
#> Information on categorical key variables:
#> 
#> Reported is the number, mean size and size of the smallest category >0 for recoded variables.
#> In parenthesis, the same statistics are shown for the unmodified data.
#> Note: NA (missings) are counted as seperate categories!
#> 
#>  Key Variable Number of categories        Mean size           
#>        <char>               <char> <char>    <char>     <char>
#>        urbrur                    2    (2)  2290.000 (2290.000)
#>          roof                    6    (5)   913.200  (916.000)
#>         walls                    4    (3)  1525.333 (1526.667)
#>         water                    9    (8)   567.500  (572.500)
#>      electcon                    4    (3)  1524.667 (1526.667)
#>         relat                    9    (9)   543.875  (508.889)
#>           sex                    2    (2)  2290.000 (2290.000)
#>  Size of smallest (>0)       
#>                 <char> <char>
#>                    646  (646)
#>                     15   (16)
#>                     46   (50)
#>                     25   (26)
#>                    103  (107)
#>                      2    (1)
#>                   2284 (2284)
#> ----------------------------------------------------------------------
#> 
#> Infos on 2/3-Anonymity:
#> 
#> Number of observations violating
#>   - 2-anonymity: 0 (0.000%) | in original data: 157 (3.428%)
#>   - 3-anonymity: 0 (0.000%) | in original data: 281 (6.135%)
#>   - 5-anonymity: 100 (2.183%) | in original data: 458 (10.000%)
#> 
#> ----------------------------------------------------------------------
#> 
#> Numerical key variables: expend, income, savings
#> 
#> Disclosure risk (~100.00% in original data):
#>   modified data: [0.00%; 100.00%]
#> 
#> Current Information Loss in modified data (0.00% in original data):
#>   IL1: 0.00
#>   Difference of Eigenvalues: 0.000%
#> ----------------------------------------------------------------------
#> 
#> Local suppression:
#>    KeyVar      | Suppressions (#)      | Suppressions (%)
#>    <char> <char>            <int> <char>           <char>
#>    urbrur      |                0      |            0.000
#>      roof      |               14      |            0.306
#>     walls      |                4      |            0.087
#>     water      |               40      |            0.873
#>  electcon      |                6      |            0.131
#>     relat      |              229      |            5.000
#>       sex      |                0      |            0.000
#> ----------------------------------------------------------------------
#> 

## verify that every ghost variable is missing exactly where the key
## variable it is linked to is missing
manipKeyVars <- get.sdcMicroObj(obj, "manipKeyVars")
manipGhostVars <- get.sdcMicroObj(obj, "manipGhostVars")
all(
  is.na(manipKeyVars[["electcon"]]) == is.na(manipGhostVars[["electcon2"]])
)
#> [1] TRUE
all(
  is.na(manipKeyVars[["electcon"]]) == is.na(manipGhostVars[["electcon3"]])
)
#> [1] TRUE
all(
  is.na(manipKeyVars[["water"]]) == is.na(manipGhostVars[["water2"]])
)
#> [1] TRUE
#> [1] TRUE

## leave some variables of the input out of the resulting object and
## list the column names of the stored original data
obj <- createSdcObj(
  testdata,
  keyVars = c("urbrur", "roof", "walls"),
  numVars = "savings",
  weightVar = w,
  excludeVars = c("relat", "electcon", "hhcivil", "ori_hid", "expend")
)
colnames(get.sdcMicroObj(obj, "origData"))
#>  [1] "urbrur"            "roof"              "walls"            
#>  [4] "water"             "sex"               "age"              
#>  [7] "income"            "savings"           "sampling_weight"  
#> [10] "household_weights" "electcon2"         "electcon3"        
#> [13] "water2"           
# }