日期：2021.10.17

链接：http://www.yufan.site/study/semester1/r/r4.html

内容：R数据科学

参考书

B1：R数据科学 [新西兰]哈德利.威克姆，[美]加勒特.格罗勒芒德

安装与加载包

# “#” 符号后内容表示注释 

if(FALSE)  #条件执行 #B1P190 # TRUE执行 / FALSE不执行
{
  install.packages("tidyverse")   #安装R包，仅需安装一次 #B1P4
  install.packages("nycflights13")
  install.packages("dplyr")
  install.packages("lubridate")
  install.packages("modelr")
  install.packages("stringr")
}


library(tidyverse)  # 加载R包，每次开始新会话需加载 #B1P4

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.5     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   2.0.1     v forcats 0.5.1

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(nycflights13)
library(dplyr)
library(lubridate)

## 
## 载入程辑包：'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(modelr)
library(stringr)

options(na.action = na.warn)
#上述语句表示，所有支持该参数的模块
#将不会自动丢弃NA值，将会给出Warning信息

实现基础模型

#模型 #第17章  #B1P243

#回归分析 #B2P317
#如果能取得数据，我们就能用统计方法来建立一个表示变量之间相互关系的方程，
#我们将这一统计方法称为回归分分析
 
#因变量  #B2P317 
#在回归术语中，我们把被预测的变量称为因变量（dependent variable）。
#统计符号y

#自变量  #B2P317
#把用来预测因变量值的一个或多个变量称为自变量（independent variable）
#统计符号x

#简单线性回归模型 （一元线性回归模型） #B2P318
#最简单类型的回归分析，它只包含一个自变量和一个因变量，两者之间的关系
#可以用一条直线近似表示。

#多元线性回归模型  #B2P361
#多元回归分析是研究因变量y如何依赖两个或两个以上自变量的问题。
#在一般情形下，我们将用p表示自变量的数目。

#残差  #B1P252 #B2P325
#对于样本的第i次观测值，因变量的观测值y和因变量的预测值y之间的离差
#称为第i个残差

 
#以下三者涉及计算及公式  #参考 教材 商务与经济统计
#最小二乘法 #B2P320
#模型的的假定  #B2P329
#估计的回归方程 #B2P319



#所有的模型都是错误的，但有些是有用的  #B1P243





###对模拟数据集sim1求一元线性回归方程
#共使用三种方法：

###R语言得到模型的方法一：
#暴力进行，随机生成大量直线，从中选择最优直线

##方法一：
#用随机生成模型（直线）的方式求最接近直线  #B1P243


#计算过程
#1.产生250条直线
#2.对于每一条直线，计算sim1的点的直线的竖直距离的平方和
#3.产生250个平方和
#4.比较250个平方和，筛选出最小的那一个，对应的直线即是我们要求的直线


# view(sim1)
# view(models)


line_cnt <- 250   #生成线的条数

line <- tibble(               #建立数据框，保存线的斜率，截距，二乘值等信息
  k = runif(line_cnt,-5,5),   #k列，表示斜率
  b = runif(line_cnt,-20,40), #b列，表示截距
  d = 0                       #二乘值
)

for(i in 1:line_cnt)          #对每条随机线计算二乘值
{
  y_line <- line$k[i]*sim1$x + line$b[i]   #根据x及直线方程算出y（y_line） #y=kx+b 
  d <- sim1$y-y_line                #点实际的y值与计算y值的离差       
  line$d[i]<-sum(d^2)               #得到二乘值
}

line <- arrange(line,d)       #对所有随机直线以二乘值进行降序排序

# view(line)

ggplot(sim1,aes(x,y)) +
  geom_abline(aes(intercept = b[1],slope = k[1]),  #降序后第一条直线即为最优直线
              data = line,
              alpha=1/4) +
  geom_point()

###R语言得到模型的方法二：
#由最小二乘值计算方法可以直接计算得到直线


##方法二：
#直接使用R语言函数进行拟合

#lm函数#B1P249
sim1_mod <- lm(y~x,data=sim1)  #lm()函数将生成y=ax+b形式的函数即模型
coef(sim1_mod) #coefficients()函数用于提取模型系数

## (Intercept)           x 
##    4.220822    2.051533

summary(sim1_mod)  #summary()用于提取模型资料

## 
## Call:
## lm(formula = y ~ x, data = sim1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.1469 -1.5197  0.1331  1.4670  4.6516 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   4.2208     0.8688   4.858 4.09e-05 ***
## x             2.0515     0.1400  14.651 1.17e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.203 on 28 degrees of freedom
## Multiple R-squared:  0.8846, Adjusted R-squared:  0.8805 
## F-statistic: 214.7 on 1 and 28 DF,  p-value: 1.173e-14

#提取的全部结果

# Residuals:   
#   Min      1Q  Median      3Q     Max 
# -4.1469 -1.5197  0.1331  1.4670  4.6516 
# 
# Coefficients:
#   Estimate Std. Error t value Pr(>|t|)    
# (Intercept)   4.2208     0.8688   4.858 4.09e-05 ***
#   x             2.0515     0.1400  14.651 1.17e-14 ***
#   ---
#   Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# 
# Residual standard error: 2.203 on 28 degrees of freedom
# Multiple R-squared:  0.8846,  Adjusted R-squared:  0.8805 
# F-statistic: 214.7 on 1 and 28 DF,  p-value: 1.173e-14

#即上述计算得到直线:y=2.0515x+4.2208


#对结果的说明
# 残差 
#   最小   一四分位数  中值    三四分位数      最大   #B2P69
# -4.1469    -1.5197  0.1331     1.4670      4.6516 
# 
# 系数
#           估计值     标准误差    T值      P值   
# (截距)     4.2208     0.8688     4.858   4.09e-05 ***
#   斜率     2.0515     0.1400    14.651   1.17e-14 ***
#   ---
#   以星号多少表示显著性等级:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# 
# 残差的标准差: 2.203 on 28 degrees of freedom （自由度为n-2即28）
# 拟合优度:  0.8846,    修正的拟合优度:  0.8805 
# F统计量（F检验）: 214.7 on 1 and 28 DF,  P值: 1.173e-14


#中位数  #B2P55
#所有数据按序排列后，中间的数

#四分位数  #B2P58
#将数据划为四部分，每一部分包含大约25%数据

#一四分位数   #B2P58
#第25%分位数

#三四分位数   #B2P58
#第75%分位数

#标准误差 #B2P331

#T检验 #B2P331
# T检验是检验解释变量的显著性的
#F检验 #B2P333
# F检验是检验方程整体显著性的

# 拟合优度是查看方程拟合程度的


#老师说明我们常用的显著性等级划分：
#"***":0.01/"**":0.05/"*":0.1




###R语言得到模型的方法三：
#使用优化方法进行求解

##方法三：
#直接使用R语言优化函数求出直线


#先使用例子进行
#求以下式子取得最大值时的x1和x2
# max(u(x1,x2)) = x1^0.4*x2^0.6
#1*x1+1*x2<=1  
  
#效用，与消费量有关
#效用（Utility）的概念，
#即指消费者在一定时间内消费某种商品或劳务一定数量获得的满足程度。
  
#由条件，势必在x1+x2=1时，x1^0.4*x2^0.6有最大值
#故可以将x2设为1-x1

fr <- function(x)
{
   x1 <- x[1]
  # x2 <- x[2]
  -(x1)^0.4*(1-x1)^0.6  #优化 #一般是往最小值优化，故最前有负号
}

#optimization
#初始值0.001
##optim 搜寻最小值

optim(c(0.001),fr)

## Warning in optim(c(0.001), fr): one-dimensional optimization by Nelder-Mead is unreliable:
## use "Brent" or optimize() directly

## $par
## [1] 0.4001
## 
## $value
## [1] -0.5101698
## 
## $counts
## function gradient 
##       44       NA 
## 
## $convergence
## [1] 0
## 
## $message
## NULL

#求得x1=0.400左右

#上例可通过纯数学方法求解

#对于存在多峰值的函数，可以选择多个初始值反复优化
#从而得到全部峰值


#开始实际求直线
func_line <- function(para)
{
  y_line <- para[1]*sim1$x + para[2]  
  d <- sim1$y-y_line          
  sum(d^2)                    #求直线的最小二乘值，为最小时即为最优直线
}

optim(c(1,2),func_line)

## $par
## [1] 2.051653 4.220338
## 
## $value
## [1] 135.8746
## 
## $counts
## function gradient 
##       71       NA 
## 
## $convergence
## [1] 0
## 
## $message
## NULL

模型构建

#第十八章 模型构建 
#案例

ggplot(diamonds, aes(cut, price)) +   #B1P267
  geom_boxplot()

ggplot(diamonds, aes(color, price)) +  #关于箱线图内容参见：  #B2P71
  geom_boxplot()

ggplot(diamonds, aes(clarity, price)) +
  geom_boxplot()

ggplot(diamonds, aes(carat, price)) +  #获得的点图可看出为曲线
  geom_hex(bins = 50)

#geom_hex() 二进制计数器的六边形热图
#将平面划分为正六边形，计算每个六边形中的个案数，
#然后（默认情况下）将案例数映射到六边形填充。

#移动变量问题
#变量代换
#对于一些结构较为复杂、变元较多的数学问题 ，引入一些新的变量进行代换，
#以简化其结构，从而达到解决问题的目的这种方法叫做变量代换法。

#对重量和价格进行对数转换，以期可以获得线性方程
diamonds2 <- diamonds %>%
  filter(carat <= 2.5) %>%
  mutate(lprice = log2(price), lcarat = log2(carat))   #进行对数转换

ggplot(diamonds2, aes(lcarat, lprice)) +
  geom_hex(bins = 50)                 #获得的点图可看出变为直线

                                      #相对曲线有一定程度简化

#可尝试进一步控制重量获取其他图形
# diamonds2 <- diamonds %>%
#   filter(carat <= 1.5) %>%
#   mutate(lprice = price, lcarat = carat)
#
# ggplot(diamonds2, aes(lcarat, lprice)) +
#   geom_hex(bins = 50)


#使用lm()函数拟合一个模型
mod_diamond <- lm(lprice ~ lcarat, data = diamonds2)

grid <- diamonds2 %>%
  data_grid(carat = seq_range(carat, 20)) %>%  #data_grid()添加网格线
  #seq_range()在向量范围内生成序列,序列长度20
  mutate(lcarat = log2(carat)) %>%
  #增加一列保存carat的对数值
  add_predictions(mod_diamond, "lprice") %>%
  #增加预测值
  mutate(price = 2 ^ lprice)
  #因为是按对数计算出预测的lprice，进行指数计算还原出真实值

#view(grid) #查看数据框

#绘图查看
ggplot(diamonds2, aes(carat, price)) +
  geom_hex(bins = 50) +
  geom_line(data = grid, color = "red", size = 1)

#多元线性回归   #更复杂的模型

diamonds2 <- diamonds2 %>%
  add_residuals(mod_diamond, "lresid")  #向数据框中添加残差列lresid

#view(diamonds2)  #查看数据框

ggplot(diamonds2, aes(lcarat, lresid)) +
  geom_hex(bins = 50)

ggplot(diamonds2, aes(cut, lresid)) + geom_boxplot()    #使用残差绘制箱线图

ggplot(diamonds2, aes(color, lresid)) + geom_boxplot()  #B1P270

ggplot(diamonds2, aes(clarity, lresid)) + geom_boxplot()

#使用三个变量拟合模型
mod_diamond2 <- lm(
  lprice ~ lcarat + color + cut + clarity,
  data = diamonds2
)

grid <- diamonds2 %>%
  data_grid(cut, .model = mod_diamond2) %>%
  add_predictions(mod_diamond2)

ggplot(grid, aes(cut, pred)) +   
  geom_point()    #模型回归变量少一个（数学原因）

#增加残差
diamonds2 <- diamonds2 %>%
  add_residuals(mod_diamond2, "lresid2")
ggplot(diamonds2, aes(lcarat, lresid2)) +
  geom_hex(bins = 50)

#view(diamonds2)  #查看数据框

#检查异常值
diamonds2 %>%
  filter(abs(lresid2) > 1) %>%    #过滤绝对值大于1，#abs()绝对值
  add_predictions(mod_diamond2) %>%     #增加预测
  mutate(pred = round(2 ^ pred)) %>%    #增加列  #round()四舍五入取整
  select(price, pred, carat:table, x:z) %>%   #选择部分列
  arrange(price)                              #按price排序

## # A tibble: 16 x 11
##    price  pred carat cut       color clarity depth table     x     y     z
##    <int> <dbl> <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <dbl> <dbl> <dbl>
##  1  1013   264  0.25 Fair      F     SI2      54.4    64  4.3   4.23  2.32
##  2  1186   284  0.25 Premium   G     SI2      59      60  5.33  5.28  3.12
##  3  1186   284  0.25 Premium   G     SI2      58.8    60  5.33  5.28  3.12
##  4  1262  2644  1.03 Fair      E     I1       78.2    54  5.72  5.59  4.42
##  5  1415   639  0.35 Fair      G     VS2      65.9    54  5.57  5.53  3.66
##  6  1415   639  0.35 Fair      G     VS2      65.9    54  5.57  5.53  3.66
##  7  1715   576  0.32 Fair      F     VS2      59.6    60  4.42  4.34  2.61
##  8  1776   412  0.29 Fair      F     SI1      55.8    60  4.48  4.41  2.48
##  9  2160   314  0.34 Fair      F     I1       55.8    62  4.72  4.6   2.6 
## 10  2366   774  0.3  Very Good D     VVS2     60.6    58  4.33  4.35  2.63
## 11  3360  1373  0.51 Premium   F     SI1      62.7    62  5.09  4.96  3.15
## 12  3807  1540  0.61 Good      F     SI2      62.5    65  5.36  5.29  3.33
## 13  3920  1705  0.51 Fair      F     VVS2     65.4    60  4.98  4.9   3.23
## 14  4368  1705  0.51 Fair      F     VVS2     60.7    66  5.21  5.11  3.13
## 15 10011  4048  1.01 Fair      D     SI2      64.6    58  6.25  6.2   4.02
## 16 10470 23622  2.46 Premium   E     SI2      59.7    59  8.82  8.76  5.25

#多重共线性问题
#多重共线性是指线性回归模型中的解释变量之间由于
#存在精确相关关系或高度相关关系而使模型估计失真或难以估计准确。

处理字符串

#第十章  #B1P131
#文本


##字符串长度
  
length <- str_length(c("a","R for data science",NA))  #字符串长度
#结果：3个长度 1 18 NA


##字符串组合  

str_c("x","y",sep=",")  #字符串组合，以逗号分隔

## [1] "x,y"

#结果："x,y"

x<-c("abc",NA)

str_c("|-",x,"-|")

## [1] "|-abc-|" NA

#结果："|-abc-|" NA 

str_c("|-",str_replace_na(x),"-|")  #str_replace_na()将NA值转换为字符串"NA"

## [1] "|-abc-|" "|-NA-|"

#结果："|-abc-|" "|-NA-|"



##字符串取子集
x<-c("Apple","Banana","Pear")

str_sub(x,1,3)   #提取前3个字符

## [1] "App" "Ban" "Pea"

#结果："App" "Ban" "Pea" 

str_sub(x,-3,-1)  #提取后3个字符

## [1] "ple" "ana" "ear"

#结果："ple" "ana" "ear"

str_sub("a",1,5)  #若待提取的字符串过短则返回尽可能多的字符

## [1] "a"

#结果："a"

str_sub(x,1,1) <- str_to_lower(str_sub(x, 1, 1)) #将首字母转小写
x

## [1] "apple"  "banana" "pear"

#结果："apple"  "banana" "pear" 



##正则表达式    #B1P135
#精确匹配字符串
x <- c("apple", "banana", "pear")
str_view(x, "an")   #匹配"an"

str_view(x, ".a.")  #匹配"a"以及其前后的字符，共3个字符

                    #"."可以匹配除换行外的任意字符


##锚点 #B1P136
x <- c("apple", "banana", "pear")
str_view(x, "^a")   #"^"从字符串开头进行匹配

str_view(x, "a$")   #"$"从字符串末尾进行匹配

x <- c("apple pie", "apple", "apple cake")
str_view(x, "apple")     #匹配完整字符串

str_view(x, "^apple$")   #包含锚点匹配完整字符串

##字符选项
str_view(c("grey", "gray"), "gr(e|a)y")  #"|"表示或者

##重复
x <- "1888 is the longest year in Roman numerals: MDCCCLXXXVIII"
str_view(x, "CC?")

str_view(x, "CC+")

str_view(x, 'C[LX]+')

#"?"：匹配0次或1次
#"+"：匹配1次或多次
#"*"：匹配0次或多次


#B1P142 #匹配检测
x <- c("apple","banana","pear")
str_detect(x,"e")  #返回一个与输入向量具有同样长度的逻辑向量

## [1]  TRUE FALSE  TRUE

#结果：TRUE FALSE  TRUE


#words：自带的词库
#sentences：Harvard sentences

# 有多少个以t开头的常用单词？
sum(str_detect(words, "^t"))

## [1] 65

#结果：65

# 以元音字母结尾的常用单词的比例是多少？
mean(str_detect(words, "[aeiou]$"))

## [1] 0.2765306

#结果：0.277

#以辅音开头的单词比例
(1-mean(str_detect(words, "^[aeiou]")))

## [1] 0.8214286

#结果：0.8214286



# 找出至少包含一个元音字母的所有单词，然后取反
no_vowels_1 <- !str_detect(words, "[aeiou]")

# 找出仅包含辅音字母（非元音字母）的所有单词
no_vowels_2 <- str_detect(words, "^[^aeiou]+$")

identical(no_vowels_1, no_vowels_2) #检查两个元素是否相等

## [1] TRUE

#结果：TRUE 相等


#words中以x结尾的词（两种方法）
words[str_detect(words, "x$")]  #找出词的位置选出词

## [1] "box" "sex" "six" "tax"

#结果："box" "sex" "six" "tax"

str_subset(words, "x$")         #直接用匹配的方法

## [1] "box" "sex" "six" "tax"

#结果："box" "sex" "six" "tax"

#words中以x结尾的词且获得序号
df <- tibble(
  word = words,
  i = seq_along(word)
)
df %>%
  filter(str_detect(words, "x$"))

## # A tibble: 4 x 2
##   word      i
##   <chr> <int>
## 1 box     108
## 2 sex     747
## 3 six     772
## 4 tax     841

#有多少句子
length(sentences)

## [1] 720

#结果：720
head(sentences) #返回开头的一部分

## [1] "The birch canoe slid on the smooth planks." 
## [2] "Glue the sheet to the dark blue background."
## [3] "It's easy to tell the depth of a well."     
## [4] "These days a chicken leg is a rare dish."   
## [5] "Rice is often served in round bowls."       
## [6] "The juice of lemons makes fine punch."

#找出包含一种颜色的句子
colors <- c(      #创建一个颜色向量
  "red", "orange", "yellow", "green", "blue", "purple"
)
color_match <- str_c(colors, collapse = "|")  #合并为一个正则表达式
color_match

## [1] "red|orange|yellow|green|blue|purple"

#结果："red|orange|yellow|green|blue|purple"


has_color <- str_subset(sentences, color_match)  #先提取出句子
matches <- str_extract(has_color, color_match)   #再提取出颜色
head(matches)

## [1] "blue" "blue" "red"  "red"  "red"  "blue"

#结果： "blue" "blue" "red" "red" "red" "blue"

#此法仍有瑕疵，如实际运用还需改进
#单词 "flickered" 会匹配到red


#10.4.5 分组匹配 #B1P146

#找出名词，方式：找出前面有冠词的词
noun <- "(a|the) ([^ ]+)"
has_noun <- sentences %>%
  str_subset(noun) %>%
  head(10)
has_noun %>%
  str_extract(noun)

##  [1] "the smooth" "the sheet"  "the depth"  "a chicken"  "the parked"
##  [6] "the sun"    "the huge"   "the ball"   "the woman"  "a helps"

#结果：
#> [1] "the smooth" "the sheet" "the depth" "a chicken"
#> [5] "the parked" "the sun" "the huge" "the ball"
#> [9] "the woman" "a helps"

#关于字符串处理仅简要带过，教材仍有细节内容







#课上练习

#1.读入文本
#2.文本字数
#3.句子的个数（长句和短句，句号和逗号）
#4.单个句子的平均字数
#5.语气词统计（自行定义词库）（may/maybe/can/could/would/might/shall/will/must）
#6.单词原音结尾的单词比例
#7.抽样，可重复的每次从文本中抽取5个单词计算元音比例，重复10W次



##1.读入文本 

# #自动获取当前文件所在路径
# script_dir <- dirname(sys.frame(1)$ofile)    #获取脚本文件所在文件路径
# #print(script_dir)
# text_a_path = str_c(script_dir,"/text A.txt") #与要读取的文件名合成完整读取路径
# text_b_path = str_c(script_dir,"/text B.txt")

#确定相对路径
text_a_path = "./text A.txt" #与要读取的文件名合成完整读取路径
text_b_path = "./text B.txt"

##各种读取文件方法

#read.table()读入
#上述函数主要用来读取矩形表格数据，对于无结构的纯文本无法有效读入。
#且上述函数读取文本时，若尾行无换行，出现报错


#read.delim()读入
# text_a_read <- read.delim(file=text_a_path,    #读入
#                           header=FALSE,colClasses = "character",encoding="UTF-8")
# text_b_read <- read.delim(file=text_b_path,  #读入
#                           header=FALSE,colClasses = "character",encoding="UTF-8")
#可能为编码问题，文件B中一处引号无法识别，导致上述函数无法正确读取文件B
#上述函数读入后会形成数据框


#scan()读入
# text_a_read <- scan(text_a_path,what="",encoding="UTF-8")
# text_b_read <- scan(text_b_path,what="",encoding="UTF-8")
#使用scan函数读入后文章被分割为单词，句子信息丢失


#readLines()读入
#对于非结构化文本，使用readLines()函数读入
text_a <- readLines(text_a_path,encoding="UTF-8")  #编码设为于与文件相同

## Warning in readLines(text_a_path, encoding = "UTF-8"): 读'./text A.txt'时最后一
## 行未遂

text_b <- readLines(text_b_path,encoding="UTF-8")

## Warning in readLines(text_b_path, encoding = "UTF-8"): 读'./text B.txt'时最后一
## 行未遂

typeof(text_a)  #读入结果为字符

## [1] "character"

typeof(text_b)

## [1] "character"

# print(text_a)  #可正常获取
# print(text_b)


##2.文本字数

#按空格对全文分解，将句子拆解为单词   #B1P147
text_a_word <- str_split(text_a," ")   #此分拆未考虑标点符号的特殊情况，不准确
text_b_word <- str_split(text_b," ")

#因原文本有多段，分解后形成list，对整体求和得到单词数目
a_word_cnt <- sum(lengths(text_a_word))
b_word_cnt <- sum(lengths(text_b_word))

print(a_word_cnt)

## [1] 443

print(b_word_cnt)

## [1] 811

#结果 A：443，B：811


##3.句子的个数（长句和短句，句号和逗号）
#原理同上，按逗号句号拆分
# a_short <- str_split(text_a,",")  #以逗号结尾视作短句
# b_short <- str_split(text_b,",")
a_short <- str_split(text_a,",|[.]")  #以句号或逗号结尾视作短句
b_short <- str_split(text_b,",|[.]")  #即包含一个逗号的长句将视作两个短句
a_long <- str_split(text_a,"[.]")   #因"."在正则表达式中有其含义
b_long <- str_split(text_b,"[.]")   #如需要匹配"."需要加方括号

# print(a_short)
# print(b_short)
# print(a_long)
# print(b_long)


#因原文本有多段，分解后形成list，对整体求和得到单词数目
a_short_cnt <- sum(lengths(a_short))      
b_short_cnt <- sum(lengths(b_short))
a_long_cnt <- sum(lengths(a_long))
b_long_cnt <- sum(lengths(b_long))

print(a_short_cnt)

## [1] 49

print(a_long_cnt)

## [1] 27

print(b_short_cnt)

## [1] 77

print(b_long_cnt)

## [1] 42

#结果 A：短句49，长句27，B：短句77，长句42

##4.单个句子的平均字数
#使用单词数除以句子数量即可
mean_a_short <- a_word_cnt/a_short_cnt
mean_a_long <- a_word_cnt/a_long_cnt
  
mean_b_short <- b_word_cnt/b_short_cnt  
mean_b_long <- b_word_cnt/b_long_cnt

print(mean_a_short)

## [1] 9.040816

print(mean_a_long)

## [1] 16.40741

print(mean_b_short)

## [1] 10.53247

print(mean_b_long)

## [1] 19.30952

#结果 A：短句9.04，长句16.40，B：短句10.53，长句19.30



##5.语气词统计（自行定义词库）
#（may/maybe/can/could/would/might/shall/will/must）

tone_word <- c(      #创建一个语气词向量
  "may", "maybe", "can", "could", "would", "might",
  "shall", "will", "must"
)
tone_match <- str_c(tone_word, collapse = "|")  #合并为一个正则表达式
print(tone_match)

## [1] "may|maybe|can|could|would|might|shall|will|must"

#结果："may|maybe|can|could|would|might|shall|will|must"

tone_a_cnt <- sum(str_count(text_a,tone_match))  
tone_b_cnt <- sum(str_count(text_b,tone_match))

print(tone_a_cnt)

## [1] 7

print(tone_b_cnt)

## [1] 11

#结果：A语气词个数7，B语气词个数11



##6.单词原音结尾的单词比例

paragraph_cnt_a <- length(text_a_word) #对分拆单词后数据框求段落数量
vowel_a <- NA  #定义向量存储每段的单词数

for(i in 1:paragraph_cnt_a) #循环计算每段元音结尾的单词数目
{
  vowel_a[i] <- sum(str_count(text_a_word[[i]], "[aeiou]$"))
}

sum_vowel_a <- sum(vowel_a)

print(sum_vowel_a)

## [1] 108

#结果：以元音结尾单词数目为108

mean_vowel_a <- sum(vowel_a)/a_word_cnt  #108/443
print(mean_vowel_a)

## [1] 0.2437923

#结果：比例0.2437923


paragraph_cnt_b <- length(text_b_word) #对分拆单词后数据框求段落数量
vowel_b <- NA  #定义向量存储每段的单词数

for(i in 1:paragraph_cnt_b) #循环计算每段元音结尾的单词数目
{
  vowel_b[i] <- sum(str_count(text_b_word[[i]], "[aeiou]$"))
}

sum_vowel_b <- sum(vowel_b)

print(sum_vowel_b)

## [1] 194

#结果：以元音结尾单词数目为194

mean_vowel_b <- sum(vowel_b)/b_word_cnt  #194/811
print(mean_vowel_b)

## [1] 0.2392109

#结果：比例0.2392109



##7.抽样，可重复的每次从文本中抽取5个单词计算元音比例，重复10W次

#受段落影响，之前按单词分拆的结果为多段组成的list
#不便于取词，使用上问中取得的段落数，将多段合并

text_a_all <- text_a_word[[1]] #先取第一段
for(i in 2:paragraph_cnt_a) #循环计算每段元音结尾的单词数目
{
  text_a_all <- append(text_a_all, text_a_word[[i]])
}

text_b_all <- text_b_word[[1]] #先取第一段
for(i in 2:paragraph_cnt_b) #循环计算每段元音结尾的单词数目
{
  text_b_all <- append(text_b_all, text_b_word[[i]])
}

#print(text_a_all) #合并完成
#print(text_b_all) #合并完成

#A 的总单词数：a_word_cnt  #433
#B 的总单词数：b_word_cnt  #811

vowel_prop_a <- NA  
vowel_prop_b <- NA  #存放每次取值b中元音字母比例

for(i in 1:10000)
{
  #生成5个随机数
  random_a <- runif(5,min=1,max=a_word_cnt) #以均匀分布方式生成随机数
  random_b <- runif(5,min=1,max=b_word_cnt) #以均匀分布方式生成随机数
  
  #取词生成向量
  vector_a <- text_a_all[random_a]
  vector_b <- text_b_all[random_b]
  
  #测试生成元素的类型
  # print(is.vector(vector_a)) #TRUE
  # print(is.vector(vector_b)) #TRUE
  # print(typeof(vector_a))   #"character"
  # print(typeof(vector_b))   #"character"
  
  #输出生成的元素
  # print(vector_a)
  # print(vector_b)
  
  #计算元音字母数，以及字母总数,以及比例
  vowel_cnt_a <- sum(str_count(vector_a, "[aeiou]"))
  letter_cnt_a <- sum(str_count(vector_a, "[a-z]"))
  vowel_prop_a[i] <- vowel_cnt_a/letter_cnt_a
  
  # print(vowel_cnt_a)
  # print(letter_cnt_a)
  # print(vowel_prop_a)
  
  #计算元音字母数，以及字母总数,以及比例
  vowel_cnt_b <- sum(str_count(vector_b, "[aeiou]"))
  letter_cnt_b <- sum(str_count(vector_b, "[a-z]"))
  vowel_prop_b[i] <- vowel_cnt_b/letter_cnt_b
  
  # print(vowel_cnt_b)
  # print(letter_cnt_b)
  # print(vowel_prop_b)  
}

vowel_result_a <- mean(vowel_prop_a)
vowel_result_b <- mean(vowel_prop_b)

print(vowel_result_a)  #0.3909523 左右

## [1] 0.3915258

print(vowel_result_b)  #0.3850945 左右

## [1] 0.3856429

#对结果绘制直方图及密度曲线
hist(vowel_prop_a,prob=T,col="light blue")  #prob=T可以得到每个取值区间的概率
lines(density(vowel_prop_a),col="red",lwd=3)  #lwd=3 表示3倍默认线宽

#生成图近似服从正态分布

#对结果绘制直方图及密度曲线
hist(vowel_prop_b,prob=T,col="light blue")  #prob=T可以得到每个取值区间的概率
lines(density(vowel_prop_b),col="red",lwd=3)  #lwd=3 表示3倍默认线宽

#生成图近似服从正态分布

第4篇

参考书

安装与加载包

实现基础模型

模型构建

处理字符串