学一学CMU的15445,课程地址
Homework #1 - SQL
要求在 SQLite
和 Duckdb
上编写一些查询代码并执行,还是有些难度的
查询至少得过一枚奖牌的教练(即相同国家和项目的运动员或队伍得过至少一枚奖牌的教练),并按以奖牌数为第一关键字降序、以名字为第二关键字字典序排序
分析:
首先我们要找出教练的国家和项目
1
select ch.name, ch.country_code, ch.discipline from coaches as ch;
然后要找出奖牌的项目和胜者 code
1
select m.discipline, m.winner_code from medals as m;
将两者通过 discipline 结合
1
select ch.name as ch_name, ch.country_code as ch_ct, m.winner_code as win from coaches as ch, medals as m where ch.discipline = m.discipline;
先找运动员
1
2
3
4
5select tmp.ch_name, count(*) as c
from (select ch.name as ch_name, ch.country_code as ch_ct, m.winner_code as win from coaches as ch, medals as m where ch.discipline = m.discipline) as tmp, athletes as a
where tmp.win = a.code and tmp.ch_ct = a.country_code
group by tmp.ch_name
having c > 0;再找队伍
这个地方需要注意,队伍 relation 中一个 tuple 表示一个队员,所以一个队伍可能会有很多 tuple,我们需要的只是 code 和 country_code,所以需要子查询
1
select code, country_code from teams group by code;
一个很有意思的事情,在
SQLite
中,上面的语句是可以正常执行的,但是在Duckdb
中会报错,因为机器不知道code
和country_code
之间存在关系,即不同的code
对应不同的country_code
,所以若只以code
分组会导致机器不知道要拿组内哪个country_code
的值来输出,产生不准确的查询,要么也以country_code
分组,要么加any_value()
函数,表示随意取一个值,但是在SQLite
中,没有这个函数1
select code, any_value(country_code) from teams group by code;
1
2
3
4
5select tmp.ch_name, count(*) as c
from (select ch.name as ch_name, ch.country_code as ch_ct, m.winner_code as win from coaches as ch, medals as m where ch.discipline = m.discipline) as tmp, (select code, country_code as ct from teams group by code) as t
where tmp.win = t.code and tmp.ch_ct = t.ct
group by tmp.ch_name
having c > 0;最后排序(丑的一批)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18-- SQLite
select q1.name, sum(c) as s
from
(
select tmp.ch_name as name, count(*) as c
from (select ch.name as ch_name, ch.country_code as ch_ct, m.winner_code as win from coaches as ch, medals as m where ch.discipline = m.discipline) as tmp, athletes as a
where tmp.win = a.code and tmp.ch_ct = a.country_code
group by tmp.ch_name
having c > 0
union all
select tmp.ch_name as name, count(*) as c
from (select ch.name as ch_name, ch.country_code as ch_ct, m.winner_code as win from coaches as ch, medals as m where ch.discipline = m.discipline) as tmp, (select code, country_code as ct from teams group by code) as t
where tmp.win = t.code and tmp.ch_ct = t.ct
group by tmp.ch_name
having c > 0
) as q1
group by q1.name
order by s desc, q1.name;1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18-- Duckdb
select q1.name, sum(c) as s
from
(
select tmp.ch_name as name, count(*) as c
from (select ch.name as ch_name, ch.country_code as ch_ct, m.winner_code as win from coaches as ch, medals as m where ch.discipline = m.discipline) as tmp, athletes as a
where tmp.win = a.code and tmp.ch_ct = a.country_code
group by tmp.ch_name
having c > 0
union all
select tmp.ch_name as name, count(*) as c
from (select ch.name as ch_name, ch.country_code as ch_ct, m.winner_code as win from coaches as ch, medals as m where ch.discipline = m.discipline) as tmp, (select code, any_value(country_code) as ct from teams group by code) as t
where tmp.win = t.code and tmp.ch_ct = t.ct
group by tmp.ch_name
having c > 0
) as q1
group by q1.name
order by s desc, q1.name;查询所有柔道选手的姓名和奖牌数,并按以奖牌数为第一关键字降序、以名字为第二关键字字典序排序
分析:
首先查询所有柔道选手
1
select name from athletes where disciplines = "['Judo']";
还有一个很有意思的事情就是
Duckdb
不支持双引号,所以你需要这样写1
select name from athletes where disciplines = '[''Judo'']';
然后选出所有的柔道奖牌
1
select winner_code from medals where discipline = 'Judo';
选手可能自己拿牌子,也可能通过队伍拿牌子,但是牌子数量是一定的
先算个人拿的牌子
1
2
3
4select a.name, count(*)
from athletes as a, medals as m
where a.disciplines = "['Judo']" and m.discipline = 'Judo' and m.winner_code = a.code
group by a.name;再算在队伍中拿的牌子
1
2
3select a.name as name, t.code as t_code
from athletes as a, teams as t
where a.disciplines = "['Judo']" and a.code = t.athletes_code;1
2
3
4
5
6
7
8select p.name, count(*)
from (
select a.name as name, t.code as t_code
from athletes as a, teams as t
where a.disciplines = "['Judo']" and a.code = t.athletes_code
) as p, medals as m
where p.t_code = m.winner_code
group by p.name;再合并排序,注意最后要加上奖牌数为
的选手 1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22-- SQLite
select q2.name, sum(c) as s
from
(
select a.name as name, count(*) as c
from athletes as a, medals as m
where a.disciplines = "['Judo']" and m.discipline = 'Judo' and m.winner_code = a.code
group by a.name
union all
select p.name as name, count(*) as c
from (
select a.name as name, t.code as t_code
from athletes as a, teams as t
where a.disciplines = "['Judo']" and a.code = t.athletes_code
) as p, medals as m
where p.t_code = m.winner_code
group by p.name
union all
select name,0 from athletes where disciplines = "['Judo']"
) as q2
group by q2.name
order by s desc, q2.name;1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22-- Duckdb
select q2.name, sum(c) as s
from
(
select a.name as name, count(*) as c
from athletes as a, medals as m
where a.disciplines = '[''Judo'']' and m.discipline = 'Judo' and m.winner_code = a.code
group by a.name
union all
select p.name as name, count(*) as c
from (
select a.name as name, t.code as t_code
from athletes as a, teams as t
where a.disciplines = '[''Judo'']' and a.code = t.athletes_code
) as p, medals as m
where p.t_code = m.winner_code
group by p.name
union all
select name,0 from athletes where disciplines = '[''Judo'']'
) as q2
group by q2.name
order by s desc, q2.name;对于所有举办过田径项目的地点,列出所有在该地点比过赛的运动员,并按以其国籍所在国与代表国之间的距离为第一关键字降序、以名字为第二关键字字典序排序
分析:
先查所有举办过田径项目的地点所对应的比赛结果
1
select r.participant_code from results as r, venues as v where v.disciplines like '%Athletics%' and r.venue = v.venue;
再查人(还是分两部分,一部分个人,一部分队伍)
1
2
3
4
5
6
7
8
9select a.name as ATHLETE_NAME, any_value(a.country_code) as REPRESENTED_COUNTRY_CODE, any_value(a.nationality_code) as NATIONALITY_COUNTRY_CODE
from
(
select r.participant_code as pc
from results as r, venues as v
where v.disciplines like '%Athletics%' and r.venue = v.venue
) as rv, athletes as a
where rv.pc = a.code
group by a.name;1
2
3
4
5
6
7
8
9select a.name as ATHLETE_NAME, any_value(a.country_code) as REPRESENTED_COUNTRY_CODE, any_value(a.nationality_code) as NATIONALITY_COUNTRY_CODE
from
(
select r.participant_code as pc
from results as r, venues as v
where v.disciplines like '%Athletics%' and r.venue = v.venue
) as rv, teams as t, athletes as a
where rv.pc = t.code and t.athletes_code = a.code
group by a.name;再排序,有细节,注意只输出经纬度都不为空的数据,正确版本如下
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35-- Duckdb
with res as
(
select a.code, any_value(a.name) as ATHLETE_NAME, any_value(a.country_code) as REPRESENTED_COUNTRY_CODE, any_value(a.nationality_code) as NATIONALITY_COUNTRY_CODE
from
(
select r.participant_code as pc
from results as r, venues as v
where v.disciplines like '%Athletics%' and r.venue = v.venue
) as rv, athletes as a
where rv.pc = a.code
group by a.code
union
select a.code, any_value(a.name) as ATHLETE_NAME, any_value(a.country_code) as REPRESENTED_COUNTRY_CODE, any_value(a.nationality_code) as NATIONALITY_COUNTRY_CODE
from
(
select r.participant_code as pc
from results as r, venues as v
where v.disciplines like '%Athletics%' and r.venue = v.venue
) as rv, teams as t, athletes as a
where rv.pc = t.code and t.athletes_code = a.code
group by a.code
)
select res.ATHLETE_NAME, res.REPRESENTED_COUNTRY_CODE, res.NATIONALITY_COUNTRY_CODE
from res, countries as c1, countries as c2
where
(
res.REPRESENTED_COUNTRY_CODE = c1.code and
res.NATIONALITY_COUNTRY_CODE = c2.code and
c1.Latitude is not null and
c2.Latitude is not null and
c1.Longitude is not null and
c2.Longitude is not null
)
order by ((c1.Latitude-c2.Latitude)*(c1.Latitude-c2.Latitude)+(c1.Longitude-c2.Longitude)*(c1.Longitude-c2.Longitude)) desc, res.ATHLETE_NAME;在之前我犯了一个错误出现了如下的结果
Std:
My:
怎么活?
排查结果是,有一个重名的选手,也就是名字一样但编号不一样,
union
的时候我没输出编号,只输出了名字,所以union
语句自己给去重了,警钟长鸣1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35-- SQLite
with res as
(
select a.code, a.name as ATHLETE_NAME, a.country_code as REPRESENTED_COUNTRY_CODE, a.nationality_code as NATIONALITY_COUNTRY_CODE
from
(
select r.participant_code as pc
from results as r, venues as v
where v.disciplines like '%Athletics%' and r.venue = v.venue
) as rv, athletes as a
where rv.pc = a.code
group by a.code
union
select a.code, a.name as ATHLETE_NAME, a.country_code as REPRESENTED_COUNTRY_CODE, a.nationality_code as NATIONALITY_COUNTRY_CODE
from
(
select r.participant_code as pc
from results as r, venues as v
where v.disciplines like '%Athletics%' and r.venue = v.venue
) as rv, teams as t, athletes as a
where rv.pc = t.code and t.athletes_code = a.code
group by a.code
)
select res.ATHLETE_NAME, res.REPRESENTED_COUNTRY_CODE, res.NATIONALITY_COUNTRY_CODE
from res, countries as c1, countries as c2
where
(
res.REPRESENTED_COUNTRY_CODE = c1.code and
res.NATIONALITY_COUNTRY_CODE = c2.code and
c1.Latitude is not null and
c2.Latitude is not null and
c1.Longitude is not null and
c2.Longitude is not null
)
order by ((c1.Latitude-c2.Latitude)*(c1.Latitude-c2.Latitude)+(c1.Longitude-c2.Longitude)*(c1.Longitude-c2.Longitude)) desc, res.ATHLETE_NAME;对于每一天,找到当天在前五名(包括)中出现次数最多的国家,同时列出其人口排名和 GDP 排名,并按日期升序排序
分析:
先查每一天各个国家在前五名的记录,注意这里是
union all
1
2
3
4
5
6
7select r.date as date, a.country_code as country_code, r.rank
from results as r, athletes as a
where r.participant_code = a.code and r.rank <= 5 and r.rank is not null
union all
select r.date as date, t.country_code as country_code, r.rank
from results as r, (select teams.code as code, any_value(teams.country_code) as country_code from teams group by teams.code) as t
where r.participant_code = t.code and r.rank <= 5 and r.rank is not null;再通过横向查询算出每天各个国家在前五名中出现的次数
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19with tms as
(
select r.date as date, a.country_code as country_code, r.rank
from results as r, athletes as a
where r.participant_code = a.code and r.rank <= 5 and r.rank is not null
union all
select r.date as date, t.country_code as country_code, r.rank
from results as r, (select teams.code as code, any_value(teams.country_code) as country_code from teams group by teams.code) as t
where r.participant_code = t.code and r.rank <= 5 and r.rank is not null
)
select distinct t1.date, t1.country_code, times.c
from tms as t1,
lateral
(
select count(*) as c
from tms as t2
where t2.date = t1.date and t2.country_code = t1.country_code
) as times
order by t1.date asc;利用窗口函数得到每天次数最大的一个 tuple
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31with max_t as
(
select
*,
row_number() over (partition by mdt.date order by mdt.c desc, mdt.country_code) as rk
from
(
with tms as
(
select r.date as date, a.country_code as country_code, r.rank
from results as r, athletes as a
where r.participant_code = a.code and r.rank <= 5 and r.rank is not null
union all
select r.date as date, t.country_code as country_code, r.rank
from results as r, (select teams.code as code, any_value(teams.country_code) as country_code from teams group by teams.code) as t
where r.participant_code = t.code and r.rank <= 5 and r.rank is not null
)
select distinct t1.date as date, t1.country_code as country_code, times.c as c
from tms as t1,
lateral
(
select count(*) as c
from tms as t2
where t2.date = t1.date and t2.country_code = t1.country_code
) as times
) as mdt
)
select m.date, m.country_code, m.c
from max_t as m
where rk = 1
order by m.date asc;查询 GDP_rank 和 populartion_rank
1
2
3
4
5
6
7
8select
code,
rank() over (order by Population desc) as rk_pop,
rank() over (order by "GDP ($ per capita)" desc) as rk_GDP
from countries
where
"GDP ($ per capita)" is not null and
Population is not null;合并!排序!
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43-- Duckdb
with max_t as
(
select
*,
row_number() over (partition by mdt.date order by mdt.c desc, mdt.country_code) as rk
from
(
with tms as
(
select r.date as date, a.country_code as country_code, r.rank
from results as r, athletes as a
where r.participant_code = a.code and r.rank <= 5 and r.rank is not null
union all
select r.date as date, t.country_code as country_code, r.rank
from results as r, (select teams.code as code, any_value(teams.country_code) as country_code from teams group by teams.code) as t
where r.participant_code = t.code and r.rank <= 5 and r.rank is not null
)
select distinct t1.date as date, t1.country_code as country_code, times.c as c
from tms as t1,
lateral
(
select count(*) as c
from tms as t2
where t2.date = t1.date and t2.country_code = t1.country_code
) as times
) as mdt
)
select m.date as date, m.country_code as country_code, m.c as top5_appearances, ct_info.rk_GDP as gdp_rank, ct_info.rk_pop as population_rank
from
max_t as m,
(
select
code,
rank() over (order by Population desc) as rk_pop,
rank() over (order by "GDP ($ per capita)" desc) as rk_GDP
from countries
where
"GDP ($ per capita)" is not null and
Population is not null
) as ct_info
where rk = 1 and ct_info.code = m.country_code
order by m.date asc;SQLite
不支持横向查询lateral
关键字1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44-- SQLite
with max_t as
(
select
*,
row_number() over (partition by mdt.date order by mdt.c desc, mdt.country_code) as rk
from
(
with tms as
(
select r.date as date, a.country_code as country_code, r.rank
from results as r, athletes as a
where r.participant_code = a.code and r.rank <= 5 and r.rank is not null
union all
select r.date as date, t.country_code as country_code, r.rank
from results as r, (select teams.code as code, teams.country_code as country_code from teams group by teams.code) as t
where r.participant_code = t.code and r.rank <= 5 and r.rank is not null
)
select distinct t1.date as date, t1.country_code as country_code, times.c as c
from tms as t1
left join
(
select t2.date, t2.country_code, count(*) as c
from tms as t2
group by t2.date, t2.country_code
) as times
on t1.date = times.date and t1.country_code = times.country_code
) as mdt
)
select m.date as date, m.country_code as country_code, m.c as top5_appearances, ct_info.rk_GDP as gdp_rank, ct_info.rk_pop as population_rank
from
max_t as m,
(
select
code,
rank() over (order by Population desc) as rk_pop,
rank() over (order by "GDP ($ per capita)" desc) as rk_GDP
from countries
where
"GDP ($ per capita)" is not null and
Population is not null
) as ct_info
where rk = 1 and ct_info.code = m.country_code
order by m.date asc;对比于东京奥运会,列出在金牌数量上有最大进步的五个国家,对于这五个国家,列出所属所有全女生的队伍,并按以提升的金牌数为第一关键字降序、以国家 code 为第二关键字字典序、以队伍 code 为第三关键字字典序排序
分析:
先查出所有国家的金牌数量
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22select s1.cc, sum(s)
from
(
select c.code as cc, count(*) as s
from medals as m, countries as c, athletes as a, medal_info as mi
where
m.winner_code = a.code and
a.country_code = c.code and
mi.code = m.medal_code and
mi.name = 'Gold Medal'
group by c.code
union all
select c.code as cc, count(*) as s
from medals as m, countries as c, (select code, any_value(country_code) as country_code from teams group by code) as t, medal_info as mi
where
m.winner_code = t.code and
t.country_code = c.code and
mi.code = m.medal_code and
mi.name = 'Gold Medal'
group by c.code
) as s1
group by cc;再查出所有国家进步的金牌数量,以及前五
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29select
s1.cc,
(sum(s)-any_value(tm.gold_medal)) as increased_gold_medal_number,
rank() over (order by increased_gold_medal_number desc) as rk
from
(
select c.code as cc, count(*) as s
from medals as m, countries as c, athletes as a, medal_info as mi
where
m.winner_code = a.code and
a.country_code = c.code and
mi.code = m.medal_code and
mi.name = 'Gold Medal'
group by c.code
union all
select c.code as cc, count(*) as s
from medals as m, countries as c, (select code, any_value(country_code) as country_code from teams group by code) as t, medal_info as mi
where
m.winner_code = t.code and
t.country_code = c.code and
mi.code = m.medal_code and
mi.name = 'Gold Medal'
group by c.code
) as s1 left join tokyo_medals as tm
on s1.cc = tm.country_code
where tm.gold_medal is not null
group by cc
order by rk
limit 5;找全女队
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20with sm1 as
(
select t.code as code, count(*) as c, any_value(t.country_code) as cc
from teams as t, athletes as a
where t.athletes_code = a.code
group by t.code
),
sm2 as
(
select t.code as code, count(*) as c, any_value(t.country_code) as cc
from teams as t, athletes as a, gender as g
where
t.athletes_code = a.code and
a.gender = g.id and
g.name = 'Female'
group by t.code
)
select sm1.cc, sm1.code
from sm1, sm2
where sm1.code = sm2.code and sm1.c = sm2.c;合并!排序!
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54-- Duckdb
with tmp as
(
select
s1.cc,
(sum(s)-any_value(tm.gold_medal)) as increased_gold_medal_number,
rank() over (order by increased_gold_medal_number desc) as rk
from
(
select c.code as cc, count(*) as s
from medals as m, countries as c, athletes as a, medal_info as mi
where
m.winner_code = a.code and
a.country_code = c.code and
mi.code = m.medal_code and
mi.name = 'Gold Medal'
group by c.code
union all
select c.code as cc, count(*) as s
from medals as m, countries as c, (select code, any_value(country_code) as country_code from teams group by code) as t, medal_info as mi
where
m.winner_code = t.code and
t.country_code = c.code and
mi.code = m.medal_code and
mi.name = 'Gold Medal'
group by c.code
) as s1 left join tokyo_medals as tm
on s1.cc = tm.country_code
where tm.gold_medal is not null
group by cc
order by rk
limit 5
),
sm1 as
(
select t.code as code, count(*) as c, any_value(t.country_code) as cc
from teams as t, athletes as a
where t.athletes_code = a.code
group by t.code
),
sm2 as
(
select t.code as code, count(*) as c, any_value(t.country_code) as cc
from teams as t, athletes as a, gender as g
where
t.athletes_code = a.code and
a.gender = g.id and
g.name = 'Female'
group by t.code
)
select tmp.cc as country_code, tmp.increased_gold_medal_number as increased_gold_medal_number, sm1.code as team_code
from sm1, sm2, tmp
where tmp.cc = sm1.cc and sm1.code = sm2.code and sm1.c = sm2.c
order by increased_gold_medal_number desc, country_code, team_code;1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54-- SQLite
with tmp as
(
select
s1.cc,
(sum(s)-tm.gold_medal) as increased_gold_medal_number,
rank() over (order by (sum(s)-tm.gold_medal) desc) as rk
from
(
select c.code as cc, count(*) as s
from medals as m, countries as c, athletes as a, medal_info as mi
where
m.winner_code = a.code and
a.country_code = c.code and
mi.code = m.medal_code and
mi.name = 'Gold Medal'
group by c.code
union all
select c.code as cc, count(*) as s
from medals as m, countries as c, (select code, country_code as country_code from teams group by code) as t, medal_info as mi
where
m.winner_code = t.code and
t.country_code = c.code and
mi.code = m.medal_code and
mi.name = 'Gold Medal'
group by c.code
) as s1 left join tokyo_medals as tm
on s1.cc = tm.country_code
where tm.gold_medal is not null
group by cc
order by rk
limit 5
),
sm1 as
(
select t.code as code, count(*) as c, t.country_code as cc
from teams as t, athletes as a
where t.athletes_code = a.code
group by t.code
),
sm2 as
(
select t.code as code, count(*) as c, t.country_code as cc
from teams as t, athletes as a, gender as g
where
t.athletes_code = a.code and
a.gender = g.id and
g.name = 'Female'
group by t.code
)
select tmp.cc as country_code, tmp.increased_gold_medal_number as increased_gold_medal_number, sm1.code as team_code
from sm1, sm2, tmp
where tmp.cc = sm1.cc and sm1.code = sm2.code and sm1.c = sm2.c
order by increased_gold_medal_number desc, country_code, team_code;
关于本文
由 wsy_jim 撰写, 采用 CC BY-NC 4.0 许可协议.