KoreanFoodie's Study

SQL 기초 - 13. 분석과 통계(소계, 순위, 통계, 피봇) 본문

Database

SQL 기초 - 13. 분석과 통계(소계, 순위, 통계, 피봇)

GoldGiver 2021. 10. 21. 16:23

이 글은 '김상형의 SQL 정복'의 예제 코드를 설명하는 방식으로 SQL 쿼리 예제를 소개하고 있습니다.

 

SQL 기초 - 13. 분석과 통계(소계, 순위, 통계, 피봇)

행 범위 지정

CREATE TABLE tMonthSale
(
	year INT,
	month INT,
	sales	INT
);

INSERT INTO tMonthSale VALUES(2021, 9, 3650);
INSERT INTO tMonthSale VALUES(2021, 10, 4120);
INSERT INTO tMonthSale VALUES(2021, 11, 5000);
INSERT INTO tMonthSale VALUES(2021, 12, 4420);
INSERT INTO tMonthSale VALUES(2022, 1, 3800);
INSERT INTO tMonthSale VALUES(2022, 2, 4200);
INSERT INTO tMonthSale VALUES(2022, 3, 4150);

SELECT gender, SUM(salary) FROM tStaff GROUP BY gender;
SELECT depart, SUM(salary) FROM tStaff GROUP BY depart;
SELECT depart, gender, SUM(salary) FROM tStaff GROUP BY depart, gender;

SELECT depart, SUM(salary) FROM tStaff GROUP BY ROLLUP(depart);

-- ROLLUP을 이용하면 그룹별 소계와 총계를 같이 계산한다!
SELECT depart, gender, SUM(salary) FROM tStaff GROUP BY ROLLUP(depart, gender);

SELECT gender, depart, SUM(salary) FROM tStaff GROUP BY ROLLUP(gender, depart);

-- ROLLUP을 다른 컬럼과 같이 조합해서 사용할 수도 있다.
SELECT depart, gender, SUM(salary) FROM tStaff GROUP BY depart, ROLLUP(gender);
SELECT gender, depart, SUM(salary) FROM tStaff GROUP BY gender, ROLLUP(depart);

-- ROLLUP이 1차 그룹핑 기준 필드에 대해서만 소계를 구한다면,
-- CUBE는 모든 기준 필드에 대한 소계를 다 구한다.	
SELECT depart, gender, SUM(salary) FROM tStaff GROUP BY CUBE(depart, gender);

SELECT depart, gender, SUM(salary) FROM tStaff GROUP BY ROLLUP(depart, gender)
UNION
SELECT depart, gender, SUM(salary) FROM tStaff GROUP BY ROLLUP(gender, depart);

SELECT depart, gender, SUM(salary) FROM tStaff GROUP BY CUBE(gender, depart);

SELECT depart, gender, SUM(salary) FROM tStaff GROUP BY GROUPING SETS(depart, gender);

-- GROUPING을 이용하면 ROLLUP된 데이터를 조금 더 직관적으로 나타낼 수 있다.
SELECT GROUPING(depart), depart, SUM(salary) FROM tStaff GROUP BY ROLLUP(depart);

-- GROUPING과 CASE문을 조합하여 NULL인 부서를 '전체부서'로 표현했다.
SELECT CASE GROUPING(depart) WHEN 1 THEN '전체부서' ELSE depart END AS depart, 
SUM(salary) FROM tStaff GROUP BY ROLLUP(depart);

SELECT CASE GROUPING(depart) WHEN 1 THEN '전체부서' ELSE depart END AS depart,
	CASE GROUPING(gender) WHEN 1 THEN '전체성별' ELSE gender END AS gender, 
	SUM(salary) FROM tStaff GROUP BY CUBE(depart, gender)
	ORDER BY /*GROUPING(depart), GROUPING(gender),*/ depart, gender;

SELECT name, depart, salary, SUM(salary) FROM tStaff;

SELECT depart, SUM(salary) FROM tStaff GROUP BY depart;

SELECT name, depart, salary, (SELECT SUM(salary) FROM tStaff) AS 월급총합 FROM tStaff;

-- OVER 절로 GROUP BY를 간단히 대체할 수 있다. (전 직원에 대한 월급을 구함)
SELECT name, depart, salary, SUM(salary) OVER() AS 월급총합 FROM tStaff;

-- 전 직원에 대한 월급 비율을 구하는 쿼리
SELECT name, depart, salary, ROUND(salary * 100.0 / SUM(salary) OVER(), 2) 
AS 월급비율 FROM tStaff;

SELECT name, depart, salary, (SELECT SUM(salary) FROM tStaff GROUP BY depart) 
AS 부서월급총합 FROM tStaff;

-- 연관 서브쿼리를 적절히 활용하자.
SELECT name, depart, salary, (SELECT SUM(salary) FROM tStaff WHERE depart = A.depart) 
AS 부서월급총합 FROM tStaff A ORDER BY depart;

-- PARTITION BY를 이용하여 선택한 컬럼별로 분류를 할 수 있다.
SELECT name, depart, salary, SUM(salary) OVER(PARTITION BY depart) 
AS 부서월급총합 FROM tStaff;

-- ORDER BY로 누적 월급을 구할 수 있다. (여기서는 이름 순으로 계속 누적됨)
SELECT name, depart, salary, SUM(salary) OVER(ORDER BY name) 
AS 누적월급 FROM tStaff;

SELECT name, depart, salary, SUM(salary) OVER(PARTITION BY depart ORDER BY name) 
AS 부서누적월급 FROM tStaff;

SELECT name, depart, salary, (SELECT SUM(salary) FROM tStaff 
WHERE name <= A.name) AS 누적월급 FROM tStaff A ORDER BY name;
SELECT name, depart, salary, (SELECT SUM(salary) FROM tStaff 
WHERE name <= A.name AND depart = A.depart) AS 부서누적월급 
FROM tStaff A ORDER BY depart, name;

-- ROWS BETWEEN 숫자 PRECEDING AND 숫자 FOLLOWING으로 위아래 행 범위를 지정할 수 있다
SELECT name, depart, salary, SUM(salary) OVER(ORDER BY name 
ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING) AS 누적월급 FROM tStaff;

SELECT name, depart, salary, SUM(salary) OVER(ORDER BY name 
ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) AS 누적월급 FROM tStaff;

SELECT name, depart, joindate, salary, 
SUM(salary) OVER(PARTITION BY depart ORDER BY joindate 
ROWS BETWEEN CURRENT ROW AND 2 FOLLOWING) AS 누적월급 FROM tStaff;

SELECT name, depart, salary, SUM(salary) OVER(ORDER BY salary 
ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS 누적월급 FROM tStaff;

-- RANGE를 쓰면 자기 자신과 같은 월급을 받는 뒤쪽행의 값까지 취한다.
-- ROWS를 쓰면 자기 자신 까지만 값을 취한다.
SELECT name, depart, salary, SUM(salary) OVER(ORDER BY salary 
RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS 누적월급 FROM tStaff;

-- RANK함수는 지정한 기준으로 정렬한 순위를 구한다. DESC가 기본이고, ASC로 역순 정렬도 가능하다.
SELECT RANK() OVER (ORDER BY salary DESC), tStaff.* FROM tStaff;
MSSQL, 마리아 : SELECT RANK() OVER (ORDER BY salary DESC), * FROM tStaff;

SELECT RANK() OVER (ORDER BY salary DESC), tStaff.* FROM tStaff ORDER BY score;

SELECT RANK() OVER (ORDER BY salary DESC) AS 순위, tStaff.* FROM tStaff 
WHERE depart = '영업부';

SELECT RANK() OVER (PARTITION BY depart ORDER BY salary DESC) 
AS 순위, tStaff.* FROM tStaff;

-- RANK는 공동 순위를 순위에 포함시키는데, DENSE_RANK는 공동 순위를 무시하고 순위를 계산한다.
SELECT DENSE_RANK() OVER (ORDER BY salary DESC), tStaff.* FROM tStaff;

-- ROW_NUMBER 함수는 행에 일련번호를 붙인다. 오라클에서는 rownum이 기본 의사 컬럼으로 존재한다.
SELECT ROW_NUMBER() OVER (ORDER BY name), tStaff.* FROM tStaff;

SELECT ROW_NUMBER() OVER (ORDER BY rownum) AS 순서, tStaff.* FROM tStaff;

-- OFFSET으로 페이지를 만들 수 있다.
-- 여기서는 5가 한 페이지의 크기며, 1~5가 0페이지, 6~10이 1페이지이다.
SELECT ROW_NUMBER() OVER (ORDER BY name) AS 순서, tStaff.* FROM tStaff
ORDER BY 순서 OFFSET 2 * 5 ROWS FETCH NEXT 5 ROWS ONLY;

SELECT * FROM
(SELECT ROW_NUMBER() OVER (ORDER BY name) AS 순서, tStaff.* FROM tStaff) S
WHERE S.순서 > 2 * 5 AND rownum <= 5;

SELECT TOP(5) * FROM
(SELECT ROW_NUMBER() OVER (ORDER BY name) AS 순서, tStaff.* FROM tStaff) S
WHERE S.순서 > 2 * 5;

SELECT ROW_NUMBER() OVER (ORDER BY name) AS 순서, tStaff.* FROM tStaff LIMIT 11, 5;

-- NTILE함수는 레코드의 집합을 n개의 영역으로 나눈다. 여기서는 4분위수.
SELECT NTILE(4) OVER (ORDER BY salary DESC) AS 구간, name, salary FROM tStaff;
SELECT NTILE(4) OVER (PARTITION BY gender ORDER BY salary DESC) 
AS 구간, name, gender, salary FROM tStaff;

SELECT name, score FROM 
(SELECT NTILE(5) OVER (ORDER BY score DESC) AS 구간, tStaff.* FROM tStaff) S 
WHERE S.구간 = 3;

-- LAG는 이전 행의 필드를 읽고 LEAD는 다음 행의 필드를 읽는다.
SELECT year, month, sales,
	LAG(sales) OVER (ORDER BY year, month) AS priorMonth,
	LEAD(sales) OVER (ORDER BY year, month) AS nextMonth
FROM tMonthSale;

SELECT year, month, sales,
	sales - LAG(sales) OVER (ORDER BY year, month) AS incsales
FROM tMonthSale;

SELECT year, month, sales,
	LAG(sales) OVER (PARTITION BY year ORDER by year, month) AS priorMonth,
	LEAD(sales) OVER (PARTITION BY year ORDER by year, month) AS nextMonth
FROM tMonthSale;

-- CUME_DIST 함수는 행의 누적 비율을 구하여 0~1 사이의 실수로 리턴한다.
-- PERCENT_RANK 함수도 비슷하게 동작하되 첫 행을 빼고 계산한다.
SELECT year, month, sales,
	ROUND(CUME_DIST() OVER (ORDER BY year, month) * 100, 2) AS cume,
	ROUND(PERCENT_RANK() OVER (ORDER BY year, month) * 100, 2) AS rank
FROM tMonthSale;

SELECT name, salary,
	ROUND(CUME_DIST() OVER (ORDER BY salary) * 100, 2) AS cume,
	ROUND(PERCENT_RANK() OVER (ORDER BY salary) * 100, 2) AS rank
FROM tStaff;

SELECT depart, name, salary,
	ROUND(CUME_DIST() OVER (PARTITION BY depart ORDER BY salary) * 100, 2) AS cume,
	ROUND(PERCENT_RANK() OVER (PARTITION BY depart ORDER BY salary) * 100, 2) AS rank
FROM tStaff;

-- FIRST_VALUE는 그룹 내의 첫 값을 구하고, LAST_VALUE는 마지막 값을 구한다.
SELECT name, salary,
	FIRST_VALUE(salary) OVER (ORDER BY salary) AS first,
	LAST_VALUE(salary) OVER (ORDER BY salary) AS midlast,
	LAST_VALUE(salary) OVER (ORDER BY salary ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS last
FROM tStaff;

SELECT name, salary,
	salary - FIRST_VALUE(salary) OVER (ORDER BY salary) AS 최저월급기준,
	LAST_VALUE(salary) OVER (ORDER BY salary ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) - salary AS 최고월급기준
FROM tStaff ORDER BY name;

SELECT depart, name, salary,
	salary - FIRST_VALUE(salary) OVER (PARTITION BY depart ORDER BY salary) AS 최저월급기준,
	LAST_VALUE(salary) OVER (PARTITION BY depart ORDER BY salary ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) - salary AS 최고월급기준
FROM tStaff ORDER BY depart, salary;

SELECT year, month, sales,	
PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY sales) 
OVER (PARTITION BY year) AS cont,	
PERCENTILE_DISC(0.5) WITHIN GROUP (ORDER BY sales) 
OVER (PARTITION BY year) AS disc
FROM tMonthSale;

CREATE TABLE tSeason
(
	item VARCHAR(10),
	season VARCHAR(10),
	sale INT
);

INSERT INTO tSeason VALUES ('냉면', '봄', 20);
INSERT INTO tSeason VALUES ('냉면', '여름', 50);
INSERT INTO tSeason VALUES ('냉면', '가을', 30);
INSERT INTO tSeason VALUES ('냉면', '겨울', 10);
INSERT INTO tSeason VALUES ('짬뽕', '봄', 30);
INSERT INTO tSeason VALUES ('짬뽕', '여름', 10);
INSERT INTO tSeason VALUES ('짬뽕', '가을', 20);
INSERT INTO tSeason VALUES ('짬뽕', '겨울', 40);

-- PIVOT 연산자는 행으로 나열되어 있는 데이터를 열로 나열하여 보기 쉽게 가공한다.
SELECT * FROM tSeason
PIVOT (MAX(sale) FOR season IN ('봄', '여름', '가을', '겨울'));

SELECT * FROM tSeason
PIVOT (MAX(sale) FOR season IN ('여름', '봄', '가을'));

SELECT * FROM tSeason
PIVOT (MAX(sale) FOR item IN ('냉면', '짬뽕'))

SELECT * FROM tSeason
PIVOT (sale FOR season IN ('봄', '여름', '가을', '겨울')) pvt;

SELECT * FROM tSeason
PIVOT (SUM(sale) FOR season IN ('봄', '여름', '가을', '겨울')) pvt;

INSERT INTO tSeason2 VALUES ('냉면', '봄', 20);
INSERT INTO tSeason2 VALUES ('냉면', '여름', 50);
INSERT INTO tSeason2 VALUES ('냉면', '가을', 30);
INSERT INTO tSeason2 VALUES ('짬뽕', '봄', 30);
INSERT INTO tSeason2 VALUES ('짬뽕', '가을', 20);
INSERT INTO tSeason2 VALUES ('짬뽕', '겨울', 40);
INSERT INTO tSeason2 VALUES ('짬뽕', '겨울', 30);

SELECT * FROM tSeason2 PIVOT (MAX(sale) FOR season IN ('봄', '여름', '가을', '겨울')) pvt;
SELECT * FROM tSeason2 PIVOT (SUM(sale) FOR season IN ('봄', '여름', '가을', '겨울')) pvt;

CREATE TABLE tTraffic
(
	line VARCHAR(10),
	hour INT,
	car VARCHAR(20),
	traffic INT
);

INSERT INTO tTraffic VALUES ('경부', 1, '승용차', 40);
INSERT INTO tTraffic VALUES ('경부', 2, '승용차', 41);
INSERT INTO tTraffic VALUES ('경부', 3, '승용차', 42);
INSERT INTO tTraffic VALUES ('경부', 1, '트럭', 30);
INSERT INTO tTraffic VALUES ('경부', 3, '트럭', 32);
INSERT INTO tTraffic VALUES ('호남', 1, '승용차', 20);
INSERT INTO tTraffic VALUES ('호남', 2, '승용차', 10);
INSERT INTO tTraffic VALUES ('호남', 2, '승용차', 11);
INSERT INTO tTraffic VALUES ('호남', 3, '승용차', 22);
INSERT INTO tTraffic VALUES ('호남', 1, '트럭', 10);
INSERT INTO tTraffic VALUES ('호남', 2, '트럭', 11);
INSERT INTO tTraffic VALUES ('호남', 3, '트럭', 12);

SELECT * FROM tTraffic PIVOT (SUM(traffic) FOR line IN ('경부', '호남')) pvt;
SELECT * FROM tTraffic PIVOT (SUM(traffic) FOR hour IN ('1', '2', '3')) pvt;

SELECT line, car, traffic FROM tTraffic 
PIVOT (SUM(traffic) FOR car IN ('승용차', '트럭')) pvt;

-- 원하는 필드만 출력하고 싶다면 서브쿼리에 PREPVT 인라인뷰를 정의하고 이를 피봇해야 한다.
SELECT * FROM
(
	SELECT line, car, traffic FROM tTraffic
) prepvt
PIVOT (SUM(traffic) FOR car IN ('승용차', '트럭')) pvt;

SELECT line, SUM(승용차), SUM(트럭) FROM tTraffic
PIVOT (SUM(traffic) FOR car IN ('승용차' AS 승용차, '트럭' AS 트럭)) pvt
GROUP BY line;

SELECT * FROM
(
	SELECT hour, car, traffic FROM tTraffic
) prepvt
PIVOT (SUM(traffic) FOR car IN ('승용차', '트럭')) pvt;

SELECT * FROM
(
	SELECT car, traffic FROM tTraffic
) prepvt
PIVOT (SUM(traffic) FOR car IN ('승용차', '트럭')) pvt;

SELECT line, 트럭, 승용차 FROM
(
	SELECT line, hour, car, traffic FROM tTraffic
) prepvt
PIVOT (SUM(traffic) FOR car IN ('승용차' AS 승용차, '트럭' AS 트럭)) pvt;

SELECT line || '선 ' || CAST(hour AS VARCHAR(10)) || '시' AS 구분, 트럭, 승용차 FROM
(
	SELECT line, hour, car, traffic FROM tTraffic
) prepvt
PIVOT (SUM(traffic) FOR car IN ('승용차' AS 승용차, '트럭' AS 트럭)) pvt
ORDER BY line;

CREATE TABLE tCityStat
(
	name CHAR(10),
	attr CHAR(10),
	value INT
);

INSERT INTO tCityStat VALUES ('서울', 'area', 605);
INSERT INTO tCityStat VALUES ('서울', 'popu', 974);
INSERT INTO tCityStat VALUES ('서울', 'gu', 25);
INSERT INTO tCityStat VALUES ('인제', 'area', 1646);
INSERT INTO tCityStat VALUES ('인제', 'popu', 3);
INSERT INTO tCityStat VALUES ('인제', 'home', 15409);
INSERT INTO tCityStat VALUES ('홍천', 'area', 1819);

SELECT * FROM tCityStat
PIVOT (MAX(value) FOR attr IN ('area' AS area, 'popu' AS popu)) pvt;

SELECT name, ROUND(popu * 10000 / area, 2) AS 인구밀도 FROM
(
    SELECT * FROM tCityStat
    PIVOT (MAX(value) FOR attr IN ('area' AS area, 'popu' AS popu)) pvt
) A;

WHERE popu IS NOT NULL AND area IS NOT NULL

CREATE TABLE tSeasonPivot AS 
SELECT * FROM tSeason
PIVOT (SUM(sale) FOR season IN ('봄' AS 봄, '여름' AS 여름, '가을' AS 가을, '겨울' AS 겨울));

SELECT * INTO tSeasonPivot FROM tSeason
PIVOT (SUM(sale) FOR season IN (봄, 여름, 가을, 겨울)) pvt;

-- UNPIVOT은 열을 값으로 변환한다.
SELECT * FROM tSeasonPivot
UNPIVOT (sale FOR season IN (봄, 여름, 가을, 겨울)) unpvt;

SELECT * FROM
(
    SELECT * FROM tSeason
    PIVOT (SUM(sale) FOR season IN 
    ('봄' AS 봄, '여름' AS 여름, '가을' AS 가을, '겨울' AS 겨울)) pvt
) A
UNPIVOT (sale FOR season IN (봄, 여름, 가을, 겨울)) unpvt;

 

Comments