# 本项目 github 仓库:

Harvard-Database-Design-Project

# 项目总结

# 项目概述

本项目旨在设计和实现一个用于存储和分析全球 COVID-19 疫苗接种数据的数据库模型。数据集包括全球不同国家和地区的疫苗接种信息,并定期更新。为了完成数据库的概念模型设计,我们需要对所涉及的数据集进行详细分析,构建关系模式,并实现数据库结构。

# 项目要求

  1. 数据集概述
    • locations.csv: 包含国家名称及其使用的疫苗类型,每一行代表某一国家的最后一次观测。
    • us_state_vaccinations.csv: 包含美国各州的疫苗接种历史数据。
    • vaccinations-by-age-group.csv: 包含各国不同年龄组的疫苗接种历史数据。
    • vaccinations-by-manufacturer.csv: 包含各国使用的不同类型疫苗的历史数据。
    • vaccinations.csv: 包含全球各国的疫苗接种数据,每一行代表一个观测日期。
    • country_data/Wales.csv, Canada.csv, United States.csv, Denmark.csv: 包含这些国家的每日疫苗接种数据。
  2. 数据库设计要求
    • 设计数据库的概念模型,确保符合第三范式 (3NF)。
    • 使用实体关系图 (ERD) 映射为数据库模式。
    • 完成数据库的标准化,消除数据冗余。

# 数据分析

# 数据集分析与关系确定

  1. country_data
    • 包含属性: location , date , vaccine , source_url , total_vaccinations , people_vaccinated , people_fully_vaccinated , total_boosters
    • 关系: (location, date) -> vaccine, source_url, total_vaccinations, people_vaccinated, people_fully_vaccinated, total_boosters
    • 进一步分析:存在推测关系 total_vaccinations ≈ people_vaccinated + people_fully_vaccinated + total_boosters ,验证后确定可以消除 total_vaccinations
  2. vaccinations-by-age-group
    • 包含属性: location , date , age_group , people_vaccinated_per_hundred , people_fully_vaccinated_per_hundred , people_with_booster_per_hundred
    • 关系: (location, date, age_group) -> people_vaccinated_per_hundred, people_fully_vaccinated_per_hundred, people_with_booster_per_hundred
  3. vaccinations-by-manufacturer
    • 包含属性: location , date , vaccine , total_vaccinations
    • 关系: (location, date, vaccine) -> total_vaccinations
  4. locations
    • 包含属性: location , iso_code , vaccines , last_observation_date , source_name , source_website
    • 关系: (location, last_observation_date) -> vaccines, source_website(source_website) -> source_name
  5. vaccinations
    • 包含属性: location , iso_code , total_vaccinations , people_vaccinated , people_fully_vaccinated , total_boosters , daily_vaccinations_raw , daily_vaccinations , date , total_vaccinations_per_hundred , people_vaccinated_per_hundred , people_fully_vaccinated_per_hundred , total_boosters_per_hundred , daily_vaccinations_per_million , daily_people_vaccinated , daily_people_vaccinated_per_hundred
    • 关系: (location, date) -> people_vaccinated, people_fully_vaccinated, total_boosters, daily_vaccinations, people_vaccinated_per_hundred, people_fully_vaccinated_per_hundred, total_boosters_per_hundred, daily_vaccinations_per_million, daily_people_vaccinated, daily_people_vaccinated_per_hundred
  6. us_state_vaccinations
    • 包含属性: location , date , total_vaccinations , people_vaccinated , people_fully_vaccinated , total_boosters , daily_vaccinations_raw , daily_vaccinations , total_distributed , total_vaccinations_per_hundred , people_vaccinated_per_hundred , people_fully_vaccinated_per_hundred , total_boosters_per_hundred , share_doses_used , daily_people_vaccinated , distributed_per_hundred
    • 关系: (location, date) -> people_vaccinated, people_fully_vaccinated, total_boosters, daily_vaccinations, total_distributed, people_fully_vaccinated_per_hundred, total_boosters_per_hundred, share_doses_used, daily_people_vaccinated, people_vaccinated_per_hundred, distributed_per_hundred

# 数据库实现

# 概念模型设计

使用实体关系图 (ERD) 进行概念模型设计,将数据重要性与残缺程度较好的属性用黑色表示,其他属性存在较大分析困难度,用浅色表示。

# 数据库模式映射

使用七步映射法 (7-step mapping process) 将 ERD 映射为数据库模式,创建以下表项:

  1. Source_Website

    • 属性: url (主键) , source_name
  2. Record_vaccine

    • 属性: location (主键) , date (主键) , vaccine (主键) , total_vaccines
  3. Record_age

    • 属性: location (主键) , date (主键) , age_group (主键) , people_vaccinated_per_hundred , people_fully_vaccinated_per_hundred , total_boosters_per_hundred
  4. Location_Data_Total

    • 属性: location (主键) , date (主键) , vaccines , source_website (外键) , total_vaccination , total_distributed , share_doses_used
  5. Location_Data_non_total

    • 属性: location (主键) , date (主键) , source_website (外键) , people_vaccinated , people_fully_vaccinated , total_boosters , daily_people_vaccinated , daily_vaccinations
  6. US_Location

    • 属性: location (主键)
  7. ISO_Location

    • 属性: location (主键) , iso_code
  8. Location_Data_Per

    • 属性: location (主键) , date (主键) , daily_vaccinations_raw , total_vaccinations_per_hundred , people_vaccinated_per_hundred , people_fully_vaccinated_per_hundred , total_boosters_per_hundred , daily_vaccinations_per_million , daily_people_vaccinated_per_hundred , distributed_per_hundred

    # 表结构的 SQL 代码

    以下是创建上述表的 SQL 代码示例:

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    CREATE TABLE Source_Website (
    URL_ VARCHAR(50) PRIMARY KEY,
    NAME_ VARCHAR(30)
    );

    CREATE TABLE Record_vaccine (
    LOCATION_ VARCHAR(50) PRIMARY KEY,
    DATE_ DATETIME PRIMARY KEY,
    VACCINE VARCHAR(50) PRIMARY KEY,
    TOTAL_VACCINES INT NOT NULL
    );

    CREATE TABLE Record_age (
    LOCATION_ VARCHAR(50) PRIMARY KEY,
    DATE_ DATETIME PRIMARY KEY,
    AGE_GROUP VARCHAR(50) PRIMARY KEY,
    PEOPLE_VACCINATED_HUND INT,
    PEOPLE_FULLY_VACCINATED_HUND INT,
    TOTAL_BOOSTERS_HUND INT
    );

    CREATE TABLE Location_Data_Total (
    LOCATION_ VARCHAR(50) PRIMARY KEY,
    DATE_ DATETIME PRIMARY KEY,
    VACCINES VARCHAR(200),
    SOURCE_WEBSITE VARCHAR(50) NOT NULL,
    TOTAL_VACCINATION INT,
    TOTAL_DISTRIBUTED INT,
    SHARE_DOSES_USED INT,
    FOREIGN KEY (SOURCE_WEBSITE) REFERENCES Source_Website(URL_)
    );

    CREATE TABLE Location_Data_non_total (
    LOCATION_ VARCHAR(50) PRIMARY KEY,
    DATE_ DATETIME PRIMARY KEY,
    SOURCE_WEBSITE VARCHAR(50) NOT NULL,
    PEOPLE_VACCINATED INT,
    PEOPLE_FULLY_VACCINATED INT,
    TOTAL_BOOSTERS INT,
    DAILY_PEOPLE_VACCINATED INT,
    DAILY_VACCINATIONS INT,
    FOREIGN KEY (SOURCE_WEBSITE) REFERENCES Source_Website(URL_)
    );

    CREATE TABLE US_Location (
    LOCATION_ VARCHAR(50) PRIMARY KEY
    );

    CREATE TABLE ISO_Location (
    LOCATION_ VARCHAR(50) PRIMARY KEY,
    ISO_CODE VARCHAR(50) NOT NULL
    );

    CREATE TABLE Location_Data_Per (
    LOCATION_ VARCHAR(50) PRIMARY KEY,
    DATE_ DATETIME PRIMARY KEY,
    DAILY_VACCINATIONS_RAW INT,
    TOTAL_VACCINATIONS_PER_HUNDRED FLOAT,
    PEOPLE_VACCINATED_PER_HUNDRED FLOAT,
    PEOPLE_FULLY_VACCINATED_PERHUNDRED FLOAT,
    TOTAL_BOOSTERS_PER_HUNDRED FLOAT,
    DAILY_VACCINATIONS_PER_MILION INT,
    DAILY_PEOPLE_VACCINATED_PER_HUNDRED FLOAT,
    DISTRIBUTED_PER_HUNDRED FLOAT
    );

    # 数据库标准化

    通过分析各表的属性关系,消除了冗余,确保所有表格均满足第三范式 (3NF),以提高数据存储和查询的效率。

    # 总结

    本项目通过详细的数据分析与设计,构建了一个规范化的数据库模型,用于全球 COVID-19 疫苗接种数据的存储和分析。通过 ERD 设计与数据库模式映射,成功实现了数据的高效存储和查询。标准化过程有效地消除了数据冗余,提高了数据库的性能和可维护性。项目最终成果满足了所有设计要求,确保了数据的完整性和一致性。