## Test commands: list all databases and tables in MySQL
sqoop list-databases \
--connect jdbc:mysql://doit03:3306 \
--username root \
--password root
sqoop list-tables \
--connect jdbc:mysql://doit03:3306/doit_mall \
--username root \
--password root
## Test command: import data from MySQL into a specified HDFS directory
## A note on parallelism: a single map task pulls data from MySQL at roughly 4-5 MB/s, while the MySQL server can sustain a throughput of about 40-50 MB/s,
## so when the amount of data in MySQL is large, consider raising the map-task parallelism to speed up the migration.
## -m is the option that sets the map-task parallelism.
## Question: once there are multiple map tasks, how do they divide up the work? (A sketch of the answer follows.)
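## A hedged sketch of how the split works for Sqoop's default numeric splitter: Sqoop first runs a boundary query
## on the --split-by column, divides the [min, max] range into -m roughly equal intervals, and hands each map task
## one interval as an extra WHERE condition. The table, column and bounds below are illustrative only:
# SELECT MIN(stu_id), MAX(stu_id) FROM doit_jw_stu_base          -- boundary query issued by Sqoop
# (suppose it returns 1 and 1000, with -m 2; each mapper then runs roughly:)
# SELECT * FROM doit_jw_stu_base WHERE stu_id >= 1   AND stu_id < 501     -- mapper 1
# SELECT * FROM doit_jw_stu_base WHERE stu_id >= 501 AND stu_id <= 1000   -- mapper 2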
## To make Sqoop treat the target directory as a path in HDFS, this parameter must be configured correctly:
# core-site.xml
# <property>
# <name>fs.defaultFS</name>
# <value>hdfs://h1:8020/</value>
# </property>
## To make Sqoop submit the MR job to YARN, this parameter must be configured correctly:
# mapred-site.xml
# <property>
# <name>mapreduce.framework.name</name>
# <value>yarn</value>
# </property>
# --delete-target-dir: if the target directory already exists, delete it first
sqoop import \
--connect jdbc:mysql://h3:3306/ry \
--username root \
--password haitao.211123 \
--table doit_jw_stu_base \
--target-dir /sqoopdata/doit_jw_stu_base \
--fields-terminated-by ',' \
--delete-target-dir \
--split-by stu_id \
-m 2
# The output file format can be specified:
--as-avrodatafile
--as-parquetfile
--as-sequencefile
--as-textfile
## If compression is needed (typically used together with -z/--compress, as in the Hive import example below)
--compression-codec gzip
## Null-value handling
# Input direction (the string that is interpreted as NULL when Sqoop parses records, i.e. on export):
--input-null-non-string <null-str>
--input-null-string <null-str>
# Output direction (the string that is written for a NULL database value, i.e. on import):
--null-non-string <null-str>
--null-string <null-str>
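## After an import finishes, each map task produces one part-m-NNNNN file under the target directory.
## A quick way to inspect the result of the first import above (standard HDFS shell commands):
hdfs dfs -ls /sqoopdata/doit_jw_stu_base            # expect part-m-00000 and part-m-00001 for -m 2
hdfs dfs -cat /sqoopdata/doit_jw_stu_base/part-m-00000 | head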
## If there is no numeric primary key, a text column can also be used to split the tasks, but an extra -D property is required, as shown below
sqoop import -Dorg.apache.sqoop.splitter.allow_text_splitter=true \
--connect jdbc:mysql://h3:3306/ry \
--username root \
--password root \
--table noid \
--target-dir /sqooptest3 \
--fields-terminated-by ',' \
--split-by name \
-m 2
## Importing MySQL data into Hive
## What it really does: first import the data from MySQL into HDFS, then use Hive's metadata-manipulation jars to create the corresponding metadata in the Hive metastore, and finally move the data files into the Hive table directory (a rough manual equivalent is sketched after the command below)
sqoop import \
--connect jdbc:mysql://h3:3306/ry \
--username root \
--password haitao.211123 \
--table doit_jw_stu_base \
--hive-import \
--hive-table yiee_dw.doit_jw_stu_base \
--delete-target-dir \
--as-textfile \
--fields-terminated-by ',' \
--compress \
--compression-codec gzip \
--split-by stu_id \
--null-string '\\N' \
--null-non-string '\\N' \
--hive-overwrite \
-m 2
# --hive-database xdb
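## A rough manual equivalent of what --hive-import automates above, assuming the data files were first imported
## into an HDFS directory such as /sqoopdata/doit_jw_stu_base (the column list here is illustrative):
-- run in the Hive CLI / beeline
CREATE TABLE IF NOT EXISTS yiee_dw.doit_jw_stu_base (
  stu_id INT, stu_name STRING, stu_phone STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
-- move the imported files into the table directory (the last step Sqoop performs)
LOAD DATA INPATH '/sqoopdata/doit_jw_stu_base' OVERWRITE INTO TABLE yiee_dw.doit_jw_stu_base;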
## Conditional import: --where
sqoop import \
--connect jdbc:mysql://h3:3306/ry \
--username root \
--password haitao.211123 \
--table doit_jw_stu_base \
--hive-import \
--hive-table yiee_dw.doit_jw_stu_base2 \
--delete-target-dir \
--as-textfile \
--fields-terminated-by ',' \
--compress \
--compression-codec gzip \
--split-by stu_id \
--null-string '\\N' \
--null-non-string '\\N' \
--hive-overwrite \
--where "stu_age>25" \
-m 2
## Conditional import: --columns specifies which columns to import
sqoop import \
--connect jdbc:mysql://h3:3306/ry \
--username root \
--password haitao.211123 \
--table doit_jw_stu_base \
--hive-import \
--hive-table yiee_dw.doit_jw_stu_base3 \
--delete-target-dir \
--as-textfile \
--fields-terminated-by ',' \
--compress \
--compression-codec gzip \
--split-by stu_id \
--null-string '\\N' \
--null-non-string '\\N' \
--hive-overwrite \
--where "stu_age>25" \
--columns "stu_id,stu_name,stu_phone" \
-m 2
## Query-based import: --query
# With --query, do not also use --table, --where, or --columns
## In a free-form --query import, the SQL statement must contain the $CONDITIONS token: either "where $CONDITIONS", or e.g. "where id>20 and $CONDITIONS"
## Why? Because Sqoop hands your SQL to several different map tasks, and each map task has to append a range condition according to its share of the work,
## so $CONDITIONS is provided as a placeholder into which those conditions are later spliced (illustrated just below)
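## A hedged illustration of the substitution, assuming --split-by stu_id and -m 2 as in the command below (the ranges are made up);
## each map task replaces $CONDITIONS with its own range:
# mapper 1:  ... where stu_createtime>"2019-09-24 23:59:59" and stu_sex="1" and ( stu_id >= 1 AND stu_id < 500 )
# mapper 2:  ... where stu_createtime>"2019-09-24 23:59:59" and stu_sex="1" and ( stu_id >= 500 AND stu_id <= 1000 )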
sqoop import \
--connect jdbc:mysql://h3:3306/ry \
--username root \
--password haitao.211123 \
--hive-import \
--hive-table yiee_dw.doit_jw_stu_base4 \
--as-textfile \
--fields-terminated-by ',' \
--compress \
--compression-codec gzip \
--split-by stu_id \
--null-string '\\N' \
--null-non-string '\\N' \
--hive-overwrite \
--query 'select stu_id,stu_name,stu_age,stu_term from doit_jw_stu_base where stu_createtime>"2019-09-24 23:59:59" and stu_sex="1" and $CONDITIONS' \
--target-dir '/user/root/tmp' \
-m 2
## --query also supports complex statements (joins, subqueries, group-by), but think carefully about how your SQL's intended semantics interact with the fact that map tasks split the work in parallel!
# Note the quoting: inside double quotes the token must be escaped as \$CONDITIONS so the shell does not expand it; inside single quotes plain $CONDITIONS works
# --query "select id,member_id,order_sn,receiver_province from doit_mall.oms_order where id>20 and \$CONDITIONS"
# --query 'select id,member_id,order_sn,receiver_province from doit_mall.oms_order where id>20 and $CONDITIONS'
sqoop import \
--connect jdbc:mysql://h3:3306/ry \
--username root \
--password haitao.211123 \
--hive-import \
--hive-table yiee_dw.doit_jw_stu_base6 \
--as-textfile \
--fields-terminated-by ',' \
--compress \
--compression-codec gzip \
--split-by id \
--null-string '\\N' \
--null-non-string '\\N' \
--hive-overwrite \
--query 'select b.id,a.stu_id,a.stu_name,a.stu_phone,a.stu_sex,b.stu_zsroom from doit_jw_stu_base a join doit_jw_stu_zsgl b on a.stu_id=b.stu_id where $CONDITIONS' \
--target-dir '/user/root/tmp' \
-m 2
## Incremental import 1: delimit the new data by a monotonically increasing column (a sketch of the resulting filter follows the command)
sqoop import \
--connect jdbc:mysql://h3:3306/ry \
--username root \
--password haitao.211123 \
--table doit_jw_stu_zsgl \
--hive-import \
--hive-table yiee_dw.doit_jw_stu_zsgl \
--split-by id \
--incremental append \
--check-column id \
--last-value 40 \
-m 2
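## Under the hood (a hedged illustration), append mode simply adds a filter on the check column, roughly:
# SELECT * FROM doit_jw_stu_zsgl WHERE id > 40
## At the end of the run Sqoop reports the new maximum value of the check column; use it as --last-value for the next incremental import.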
## Incremental import 2: delimit the changed data by a modification timestamp; this requires a time column that is updated whenever the row is modified
## Incremental import in lastmodified mode does not support importing directly into Hive
sqoop import \
--connect jdbc:mysql://h3:3306/ry \
--username root \
--password haitao.211123 \
--table doit_jw_stu_zsgl \
--target-dir '/sqoopdata/doit_jw_stu_zsgl' \
--incremental lastmodified \
--check-column stu_updatetime \
--last-value '2019-09-30 23:59:59' \
--fields-terminated-by ',' \
--merge-key id \
-m 1
# Should the imported incremental data simply be appended, or merged with the old data? Two choices (a conceptual sketch of the merge follows):
--append         # the incremental data is appended to the target storage as-is
--merge-key id   # the incremental data is not simply appended; new and old rows are merged on the given key
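## Conceptually, the merge keyed on id keeps only the newest version of each row; illustrative SQL only
## (old_data / new_increment are placeholder names for the existing data and the new increment):
# SELECT * FROM (
#   SELECT t.*, ROW_NUMBER() OVER (PARTITION BY id ORDER BY stu_updatetime DESC) AS rn
#   FROM (SELECT * FROM old_data UNION ALL SELECT * FROM new_increment) t
# ) x WHERE rn = 1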
## Appendix: the full list of import control arguments
Table 3. Import control arguments:

| Argument | Description |
| --- | --- |
| --append | Append data to an existing dataset in HDFS |
| --as-avrodatafile | Imports data to Avro Data Files |
| --as-sequencefile | Imports data to SequenceFiles |
| --as-textfile | Imports data as plain text (default) |
| --as-parquetfile | Imports data to Parquet Files |
| --boundary-query <statement> | Boundary query to use for creating splits |
| --columns <col,col,col…> | Columns to import from table |
| --delete-target-dir | Delete the import target directory if it exists |
| --direct | Use direct connector if exists for the database |
| --fetch-size <n> | Number of entries to read from database at once |
| --inline-lob-limit <n> | Set the maximum size for an inline LOB |
| -m,--num-mappers <n> | Use n map tasks to import in parallel |
| -e,--query <statement> | Import the results of statement |
| --split-by <column-name> | Column of the table used to split work units. Cannot be used with --autoreset-to-one-mapper option |
| --split-limit <n> | Upper limit for each split size. This only applies to Integer and Date columns. For date or timestamp fields it is calculated in seconds |
| --autoreset-to-one-mapper | Import should use one mapper if a table has no primary key and no split-by column is provided. Cannot be used with --split-by <col> option |
| --table <table-name> | Table to read |
| --target-dir <dir> | HDFS destination dir |
| --temporary-rootdir <dir> | HDFS directory for temporary files created during import (overrides default "_sqoop") |
| --warehouse-dir <dir> | HDFS parent for table destination |
| --where <where clause> | WHERE clause to use during import |
| -z,--compress | Enable compression |
| --compression-codec <c> | Use Hadoop codec (default gzip) |
| --null-string <null-string> | The string to be written for a null value for string columns |
| --null-non-string <null-string> | The string to be written for a null value for non-string columns |
## Exporting data with Sqoop
sqoop export \
--connect jdbc:mysql://h3:3306/dicts \
--username root \
--password haitao.211123 \
--table dau_t \
--export-dir '/user/hive/warehouse/dau_t' \
--batch                                # execute the underlying SQL statements in JDBC batch mode
## Choosing an update mode to control how new vs. existing rows are handled when exporting to MySQL (see the sketch after the command)
sqoop export \
--connect jdbc:mysql://h3:3306/doit_mall \
--username root \
--password root \
--table person \
--export-dir '/export3/' \
--input-null-string 'NaN' \
--input-null-non-string 'NaN' \
--update-mode allowinsert \
--update-key id \
--batch
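## For MySQL, allowinsert mode is typically realized as an upsert, roughly one statement like the following per record
## (the columns are illustrative; --update-key id should correspond to a PRIMARY or UNIQUE key for this to work):
# INSERT INTO person (id, name, age) VALUES (?, ?, ?)
#   ON DUPLICATE KEY UPDATE name = VALUES(name), age = VALUES(age)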
## Appendix: export control arguments
Table 29. Export control arguments:

| Argument | Description |
| --- | --- |
| --columns <col,col,col…> | Columns to export to table |
| --direct | Use direct export fast path |
| --export-dir <dir> | HDFS source path for the export |
| -m,--num-mappers <n> | Use n map tasks to export in parallel |
| --table <table-name> | Table to populate |
| --call <stored-proc-name> | Stored Procedure to call |
| --update-key <col-name> | Anchor column to use for updates. Use a comma separated list of columns if there are more than one column |
| --update-mode <mode> | Specify how updates are performed when new rows are found with non-matching keys in database. Legal values for mode include updateonly (default) and allowinsert |
| --input-null-string <null-string> | The string to be interpreted as null for string columns |
| --input-null-non-string <null-string> | The string to be interpreted as null for non-string columns |
| --staging-table <staging-table-name> | The table in which data will be staged before being inserted into the destination table |
| --clear-staging-table | Indicates that any data present in the staging table can be deleted |
| --batch | Use batch mode for underlying statement execution |
## Appendix:
-- Changing the character set of a MySQL database or table
Change the database character set:
mysql> alter database db_name character set utf8;
Change the table character set:
mysql> ALTER TABLE table_name CONVERT TO CHARACTER SET utf8 COLLATE utf8_general_ci;