290 lines
9.9 KiB
SAS
290 lines
9.9 KiB
SAS
options ls=256 ps=32767 nodate validmemname=extend validvarname=any;
|
||
|
||
title 'The SAS System';
|
||
|
||
%macro print(d);
|
||
proc print data=&d;run;
|
||
%mend;
|
||
%macro printobs(d,obs);
|
||
proc print data=&d (obs=&obs);run;
|
||
%mend;
|
||
%macro printfirstobsobs(d,firstobs,obs);
|
||
proc print data=&d (firstobs=&firstobs obs=&obs);run;
|
||
%mend;
|
||
%macro contents(d);
|
||
proc contents data=&d varnum;run;
|
||
%mend;
|
||
%macro contentsshort(d);
|
||
proc contents data=&d varnum short;run;
|
||
%mend;
|
||
%macro save_dataset(d);
|
||
data "d:&d";
|
||
set &d;
|
||
run;
|
||
%mend;
|
||
%macro load_dataset(d);
|
||
data &d;
|
||
set "d:&d";
|
||
run;
|
||
%mend;
|
||
%macro kill;
|
||
PROC DATASETS LIB=work KILL;RUN;quit;
|
||
%mend;
|
||
proc template;
|
||
list styles;
|
||
run;
|
||
|
||
|
||
|
||
%kill;
|
||
PROC IMPORT OUT=WORK.raw_metadf
|
||
DATAFILE="d:airquality.xlsx"
|
||
DBMS=EXCEL REPLACE;
|
||
RANGE="metadf$";
|
||
GETNAMES=YES;
|
||
MIXED=YES;
|
||
SCANTEXT=YES;
|
||
USEDATE=NO;
|
||
SCANTIME=NO;
|
||
RUN;
|
||
%print(raw_metadf);
|
||
%save_dataset(raw_metadf); *原始数据集存盘;
|
||
|
||
PROC IMPORT OUT=WORK.raw_airqualitydf
|
||
DATAFILE="d:airquality.xlsx"
|
||
DBMS=EXCEL REPLACE;
|
||
RANGE="airqualitydf$";
|
||
GETNAMES=YES;
|
||
MIXED=YES;
|
||
SCANTEXT=YES;
|
||
USEDATE=NO; *为YES崩溃闪退;
|
||
SCANTIME=NO; *为YES崩溃闪退;
|
||
RUN;
|
||
%print(raw_airqualitydf);
|
||
%save_dataset(raw_airqualitydf); *原始数据集存盘;
|
||
|
||
|
||
%kill;
|
||
/*加载硬盘原始数据集*/
|
||
%load_dataset(raw_metadf);
|
||
%load_dataset(raw_airqualitydf);
|
||
/*查看数据集内容*/
|
||
%contents(raw_metadf);
|
||
%contentsshort(raw_metadf);
|
||
/*site name Area lon lat*/
|
||
%contents(raw_airqualitydf);
|
||
%contentsshort(raw_airqualitydf);
|
||
/*datetime site 'CO_mg/m3'n 'CO_24h_mg/m3'n 'NO2_μg/m3'n 'NO2_24h_μg/m3'n 'O3_μg/m3'n 'O3_24h_μg/m3'n 'O3_8h_μg/m3'n 'O3_8h_24h_μg/m3'n 'PM10_μg/m3'n 'PM10_24h_μg/m3'n 'PM2#5_μg/m3'n 'PM2#5_24h_μg/m3'n 'SO2_μg/m3'n 'SO2_24h_μg/m3'n AQI PrimaryPollutant Quality Unheathful*/
|
||
|
||
%printobs(raw_metadf,10);
|
||
%printobs(raw_airqualitydf,10);
|
||
proc sort data=raw_metadf out=metadfsorted;
|
||
by site;
|
||
run;
|
||
proc sort data=raw_airqualitydf out=airqualitydfsorted;
|
||
by site;
|
||
run;
|
||
/*合并数据集,数据预处理,提取日期、时间部分,划分day、night*/
|
||
data airquality;
|
||
retain datetime date time DayNight site name Area AQI lon lat;
|
||
length DayNight $ 5;
|
||
merge metadfsorted airqualitydfsorted;
|
||
by site;
|
||
date=datepart(datetime);
|
||
time=timepart(datetime);
|
||
if '8:00't<=time<'20:00't then DayNight='day';
|
||
else DayNight='night';
|
||
format datetime e8601dt25. date yymmdd10. time time5.;
|
||
keep site name Area lon lat datetime date time DayNight AQI;
|
||
run;
|
||
%printobs(airquality,100);
|
||
%save_dataset(airquality); *合并数据集存盘;
|
||
|
||
|
||
|
||
%kill;
|
||
/*#################### DATA SET airquality ####################*/
|
||
/*加载硬盘合并数据集*/
|
||
%load_dataset(airquality);
|
||
%printobs(airquality,100);
|
||
|
||
/*检查site、name数量是否一致,发现不一致,后以site进行统计*/
|
||
proc sql;
|
||
select count(distinct(site)) as count_site from airquality;
|
||
select count(distinct(name)) as count_name from airquality;
|
||
quit;
|
||
/*
|
||
count_site
|
||
1714
|
||
count_name
|
||
1522
|
||
*/
|
||
|
||
|
||
/*#########################################################################*/
|
||
/*按采样点统计白天(8:00-20:00)与夜晚(20:00-8:00)中空气质量指数(AQI)中位数*/
|
||
/*#########################################################################*/
|
||
/*@@@@@@@@@@@@@@@@@@@@@ PDF @@@@@@@@@@@@@@@@@@@@@@*/
|
||
ods pdf file='d:means.pdf' style=sapphire dpi=1200;
|
||
proc means data=airquality median maxdec=1;
|
||
class site DayNight;
|
||
var AQI;
|
||
where AQI is not missing;
|
||
run;
|
||
ods pdf close;
|
||
/*@@@@@@@@@@@@@@@@@@@@@ PDF @@@@@@@@@@@@@@@@@@@@@@*/
|
||
|
||
|
||
/*####################################################################################################*/
|
||
/*按城市统计低于所有采样点AQI30%分位值的采样点占比,列出上述占比最高的10个城市(不考虑采样点数低于5个的城市)*/
|
||
/*####################################################################################################*/
|
||
/*输出查看所有采样点AQI30%分位值,为与SQL验证,可略去*/
|
||
proc univariate data=airquality noprint;
|
||
var AQI;
|
||
output out=airqualitystats pctlpts=30 pctlpre=P;
|
||
run;
|
||
%print(airqualitystats); /*42*/
|
||
|
||
/*输出所有采样点AQI30%分位值到宏变量*/
|
||
proc sql;
|
||
select AQI into : xvalues separated by ',' from airquality;
|
||
select distinct(pctl(30, &xvalues)) into : P30 from airquality;
|
||
quit;
|
||
/*查看所有采样点AQI30%分位值的宏变量值,为后续调用*/
|
||
%put P30=&P30.;
|
||
|
||
/*按所有采样点AQI30%分位值对AQI分级,对合并数据集所有采样点所有数据直接分级,后续用各采样点中位数进行统计,可略去*/
|
||
data airquality1;
|
||
set airquality;
|
||
if AQI<&P30. then quality='good';
|
||
else quality='fair';
|
||
run;
|
||
%printobs(airquality1,100);
|
||
|
||
/*输出所有采样点AQI中位数*/
|
||
proc means data=airquality median maxdec=1;
|
||
class Area site;
|
||
var AQI;
|
||
where AQI is not missing;
|
||
output out=airqualitymedian median=;
|
||
run;
|
||
%print(airqualitymedian);
|
||
|
||
/*按所有采样点AQI30%分位值对AQI中位数分级*/
|
||
data airqualitymedian1;
|
||
set airqualitymedian;
|
||
if AQI<&P30. then quality='good';
|
||
else quality='fair';
|
||
where _TYPE_=3;
|
||
run;
|
||
%print(airqualitymedian1);
|
||
|
||
/*按城市统计低于所有采样点AQI30%分位值的采样点占比,查看结果*/
|
||
/*@@@@@@@@@@@@@@@@@@@@@ PDF @@@@@@@@@@@@@@@@@@@@@*/
|
||
ods pdf file='d:freq.pdf' style=sapphire dpi=1200;
|
||
proc freq data=airqualitymedian1;
|
||
table Area*quality /nocol nopercent;
|
||
run;
|
||
ods pdf close;
|
||
/*@@@@@@@@@@@@@@@@@@@@@ PDF @@@@@@@@@@@@@@@@@@@@@*/
|
||
/*按城市统计低于所有采样点AQI30%分位值的采样点占比,输出频数统计结果到数据集*/
|
||
proc freq data=airqualitymedian1;
|
||
table Area*quality /outpct out=airqualitymedianoutrow(drop=percent pct_col); *保留行列频数与行百分比;
|
||
run;
|
||
%printobs(airqualitymedianoutrow,100);
|
||
|
||
/*按城市对采样点数进行统计,查看结果,可略去*/
|
||
proc means data=airqualitymedianoutrow sum maxdec=0;
|
||
class Area;
|
||
var COUNT;
|
||
run;
|
||
/*输出采样点数不低于5个的城市,查看结果,可略去*/
|
||
proc sql;
|
||
select *,sum(COUNT) as total_COUNT from airqualitymedianoutrow group by Area having calculated total_COUNT>=5 order by quality desc,PCT_ROW desc,COUNT desc;
|
||
quit;
|
||
/*将采样点数不低于5个的城市,输出到数据集*/
|
||
proc sql;
|
||
create table airqualitymedianoutrow5 as select *,sum(COUNT) as total_COUNT from airqualitymedianoutrow group by Area having calculated total_COUNT>=5 order by quality desc,PCT_ROW desc,COUNT desc;
|
||
quit;
|
||
/*列出上述占比最高的10个城市(不含采样点数低于5个的城市)*/
|
||
/*@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ PDF @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@*/
|
||
ods pdf file='d:airqualitymedianoutrow5.pdf' style=sapphire dpi=1200;
|
||
%printobs(airqualitymedianoutrow5,10);
|
||
%printobs(airqualitymedianoutrow5,20);
|
||
%printobs(airqualitymedianoutrow5,30);
|
||
%print(airqualitymedianoutrow5);
|
||
ods pdf close;
|
||
/*@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ PDF @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@*/
|
||
|
||
|
||
/*#####################################################*/
|
||
/*按照不同城市分组,统计白天与夜晚AQI中位数是否具有显著差异*/
|
||
/*#####################################################*/
|
||
/*发现有的site没有Area对应*/
|
||
proc sql;
|
||
select distinct(Area),count(distinct(Area)) as count_Area from airquality;
|
||
quit;
|
||
proc print data=airquality;
|
||
where Area is missing;
|
||
run;
|
||
proc sql;
|
||
select distinct(site),count(distinct(site)) as count_site from airquality where Area is missing;
|
||
quit;
|
||
/*有4个site(采样点)没有Area(城市)对应
|
||
site count_site
|
||
2628A 4
|
||
3128A 4
|
||
4034A 4
|
||
4036A 4
|
||
*/
|
||
proc sort data=airquality out=airqualitysorted;
|
||
by Area;
|
||
run;
|
||
/*按照不同城市分组,统计白天与夜晚AQI中位数,查看结果,可略去*/
|
||
proc means data=airqualitysorted median maxdec=1;
|
||
by Area;
|
||
class DayNight;
|
||
var AQI;
|
||
where AQI is not missing;
|
||
run;
|
||
/*笼统地看,白天与夜晚AQI中位数是否具有显著差异*/
|
||
proc npar1way data=airquality median;
|
||
class DayNight;
|
||
var AQI;
|
||
run;
|
||
/*按照不同城市分组,统计白天与夜晚AQI中位数是否具有显著差异*/
|
||
/*@@@@@@@@@@@@@@@@@@@@@@@@@@ PDF @@@@@@@@@@@@@@@@@@@@@@@@@*/
|
||
ods pdf file='d:npar1waymedian.pdf' style=sapphire dpi=1200;
|
||
proc npar1way data=airqualitysorted median;
|
||
class DayNight;
|
||
var AQI;
|
||
by Area;
|
||
where Area is not missing;
|
||
run;
|
||
ods pdf close;
|
||
/*@@@@@@@@@@@@@@@@@@@@@@@@@@ PDF @@@@@@@@@@@@@@@@@@@@@@@@@*/
|
||
/*
|
||
Using Wilcoxon scores in the linear rank statistic for two-sample data produces the rank sum statistic of the Mann-Whitney-Wilcoxon test.
|
||
Using Wilcoxon scores in the one-way ANOVA statistic produces the Kruskal-Wallis test.
|
||
Wilcoxon scores are locally most powerful for location shifts of a logistic distribution.
|
||
*//*
|
||
Using median scores in the linear rank statistic for two-sample data produces the two-sample median test.
|
||
The one-way ANOVA statistic with median scores is equivalent to the Brown-Mood test.
|
||
Median scores are particularly powerful for distributions that are symmetric and heavy-tailed.*/
|
||
|
||
/*
|
||
Scores for Linear Rank and One-Way ANOVA Tests
|
||
For each score type that you specify, PROC NPAR1WAY computes a one-way ANOVA statistic and also a linear rank statistic for two-sample data. The following score types are used primarily to test for differences in location: Wilcoxon, median, Van der Waerden (normal), and Savage. The following scores types are used to test for scale differences: Siegel-Tukey, Ansari-Bradley, Klotz, and Mood. Conover scores can be used to test for differences in both location and scale. This section gives formulas for the score types available in PROC NPAR1WAY. For further information about the formulas and the applicability of each score, see Randles and Wolfe (1979), Gibbons and Chakraborti (2010), Conover (1999), and Hollander and Wolfe (1999).
|
||
In addition to the score types described in this section, you can specify the SCORES=DATA option to use the input data observations as scores. This enables you to produce a wide variety of tests. You can construct any scores by using the DATA step, and then you can use PROC NPAR1WAY to compute the corresponding linear rank and one-way ANOVA tests for these scores. You can also analyze raw (unscored) data by using the SCORES=DATA option; for two-sample data, the corresponding exact test is a permutation test that is known as Pitman’s test.
|
||
*/
|
||
/*@@@@@@@@@@@@@@@@@@@@@@@@@@@ PDF @@@@@@@@@@@@@@@@@@@@@@@@@*/
|
||
ods pdf file='d:npar1wayConover.pdf' style=sapphire dpi=1200;
|
||
proc npar1way data=airqualitysorted Conover;
|
||
class DayNight;
|
||
var AQI;
|
||
by Area;
|
||
where Area is not missing;
|
||
run;
|
||
/*@@@@@@@@@@@@@@@@@@@@@@@@@@@ PDF @@@@@@@@@@@@@@@@@@@@@@@@@*/
|
||
/*Conover scores can be used to test for differences in both location and scale.*/
|