In God we trust; all others must bring data
MJJ08398
数据挖掘
In God we trust; all others must bring data
base
:read.csv
,read.table
readr
:readxl
data.table
dplyr
RODBC
base::read.csv
和base::read.table
包含在base包中常用的读取格式化数据的函数。base::read.csv
是base::read.table
的sep=','
的特例。
read.table(file,header = FALSE,
sep = "",
quote = "\"'", #
dec = ".", #
numerals = c("allow.loss", "warn.loss", "no.loss"),
row.names, #
col.names, #
as.is = !stringsAsFactors,skip = 0, check.names = TRUE, fill = !blank.lines.skip,
na.strings = "NA",
colClasses = NA, #
nrows = -1,
strip.white = FALSE, blank.lines.skip = TRUE,
comment.char = "#", #
allowEscapes = FALSE, flush = FALSE,
stringsAsFactors = default.stringsAsFactors(),
fileEncoding = "", encoding = "unknown", text, skipNul = FALSE)
readr
readr
做了统一的接口,并且省略了一些不必要的参数。datasource
函数,统一了数据来源read_csv(file,
col_names = TRUE,
col_types = NULL, #使用col_* 系列函数指定列的类型
locale = default_locale(),
na = c("", "NA"),
comment = "",
trim_ws = TRUE, #去除空白
skip = 0,
n_max = -1,
progress = interactive())
readr
函数的使用举例1require(readr)
## Loading required package: readr
dr1 <- datasource("a,b,c\n1,2,3")
dr2 <- datasource(system.file("extdata/mtcars.csv", package = "readr"))
head(read_csv(dr1),n = 2)
## a b c
## 1 a b c
## 2 1 2 3
head(read_csv(dr2),n=2)
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1 mpg cyl disp hp drat wt qsec vs am gear carb
## 2 21 6 160 110 3.9 2.62 16.46 0 1 4 4
readr
函数的使用举例2require(readr)
dr1 <- datasource("2015-01-02,1,a\n2015-02-03,2,b")
dat <- read_csv(dr1,col_names = FALSE,
col_types = list(col_date(),
col_integer(),
col_character()))
dat$X1[1]-dat$X1[2]
## Time difference of -32 days
readxl
方便地读取Excel数据
data.table::fread
能够快速地读取数据,数据为data.table
格式的数据,可以快速地筛选处理。需要注意其[
操作符。
require(data.table)
DT = data.table(x=rep(c("a","b","c"),each=3), y=c(1,3,6), v=1:9)
DT[y %between% c(3,6)] # y在3-6之间的行
DT[2] # 第二行
DT[,v] # v列
DT[,list(v)] # v列,注意和DT[,v]的区别
DT[2:3,sum(v)] # 第2-3行的v列求和
DT[2:5,cat(v,"\n")] #
DT[c(FALSE,TRUE)] # 偶数行
DT[,2,with=FALSE] # 比较 DT[,2,with=TRUE]
setkey(DT,x) # 设置x列位key
DT['a'] #取出DT中x为a的行
DT[,sum(v),by=x] # 按照x中值进行group
X = data.table(c("b","c"),foo=c(4,2))
X
DT[X] # 使用DT中的x进行join
dplyr
中连接数据库的函数时间序列函数的转换
as.Date
as.POSIXct
xts
以及相关的包# cctCalls的列为count dateBin
cctCalls$dateBin <- as.POSIXct(x = cctCalls$dateBin)
full <- seq(from = as.POSIXct('2015-01-01'),
by='1 hour',
to = as.POSIXct('2016-05-30'))
cctCalls2 <- data.frame(Date=full,
value=with(cctCalls, count[match(full, dateBin)]))
rownames(cctCalls2) <- cctCalls2$dateBin
cctCalls2 <- cctCalls2[,1:2]
cctCalls2TS <- as.xts(as.matrix(cctCalls2))