-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathread_sas.R
53 lines (51 loc) · 2.28 KB
/
read_sas.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
#' @title Read in SAS datasets in .sas7bdat format into Spark by using the spark-sas7bdat Spark package.
#' @description Read in SAS datasets in .sas7bdat format into Spark by using the spark-sas7bdat Spark package.
#' @param sc Connection to a Spark local instance or remote cluster. See the examples.
#' @param path full path to the SAS file, located on HDFS (hdfs://), S3 (s3n://), or the local file system (file://).
#' Note that files on the local file system need to be specified using the full path.
#' @param table character string with the name of the Spark table into which the SAS dataset will be loaded
#' @return an object of class \code{tbl_spark}, which is a reference to a Spark DataFrame based on which
#' dplyr functions can be executed. See \url{https://github.com/sparklyr/sparklyr}
#' @export
#' @seealso \code{\link[sparklyr]{spark_connect}}, \code{\link[sparklyr]{sdf_register}}
#' @references \url{https://spark-packages.org/package/saurfang/spark-sas7bdat}, \url{https://github.com/saurfang/spark-sas7bdat}, \url{https://github.com/sparklyr/sparklyr}
#' @examples
#' \dontrun{
#' ## If you haven't got a Spark cluster, you can install Spark locally like this
#' library(sparklyr)
#' spark_install(version = "2.0.1")
#'
#' ## Define the SAS .sas7bdat file, connect to the Spark cluster to read + process the data
#' myfile <- system.file("extdata", "iris.sas7bdat", package = "spark.sas7bdat")
#' myfile
#'
#' library(spark.sas7bdat)
#' sc <- spark_connect(master = "local")
#' x <- spark_read_sas(sc, path = myfile, table = "sas_example")
#' x
#'
#' library(dplyr)
#' x %>% group_by(Species) %>%
#' summarise(count = n(), length = mean(Sepal_Length), width = mean(Sepal_Width))
#' }
spark_read_sas <- function(sc, path, table) {
  # A target table name is mandatory: fail before touching the connection.
  if (missing(table)) {
    stop("Please provide the name of the Spark table where to store the SAS file into")
  }

  # Walk the reader chain step by step: context -> "read" -> "format" -> "load".
  context <- hive_context(sc)
  reader <- invoke(context, "read")
  sas_reader <- invoke(reader, "format", "com.github.saurfang.sas.spark")
  loaded <- invoke(sas_reader, "load", path)

  # Register the loaded object under the requested table name and hand back
  # whatever sdf_register() returns.
  registered <- sdf_register(loaded, name = table)
  registered
}
# Declares the Spark package this extension needs.
# `spark_version` and `...` are accepted but not used; only `scala_version`
# is substituted into the package coordinate.
# NOTE(review): presumably called by sparklyr for extensions registered via
# register_extension() -- confirm against the sparklyr extension docs.
spark_dependencies <- function(spark_version, scala_version, ...) {
  coordinate_template <- "saurfang:spark-sas7bdat:2.0.0-s_%s"
  sas7bdat_package <- sprintf(coordinate_template, scala_version)
  sparklyr::spark_dependency(packages = sas7bdat_package)
}
# Package load hook: registers this package (by name) as a sparklyr extension.
# `libname` is part of the standard hook signature and is not used here.
.onLoad <- function(libname, pkgname) {
  extension_name <- pkgname
  sparklyr::register_extension(extension_name)
}