《算法筆記》5. 字首樹、桶排序、排序算法總結

1 字首樹結構(trie)、桶排序、排序總結
- 1.1 字首樹結構
- 1.2 不基于比較的排序-桶排序
  - 1.2.1 計數排序
  - 1.2.2 基數排序
- 1.3 排序算法的穩定性
  - 1.3.1 穩定的排序
  - 1.3.2 不穩定的排序
  - 1.3.3 排序穩定性對比
- 1.4 排序算法總結
- 1.5 排序常見的坑點
- 1.6 工程上對排序的改進

1 字首樹結構(trie)、桶排序、排序總結

1.1 字首樹結構

單個字元串中，字元從前到後的加到一顆多叉樹上

字元放在路上，節點上有專屬的資料項(常見的是pass和end值)

所有樣本都這樣添加。如果沒有路就建立，如果有路就複用

沿途節點的pass值增加1.每個字元串結束時來到的節點end值增加1

一個字元串數組中，所有字元串的字元數為N,整個數組加入字首樹種的代價是O(N)

功能一：建構好字首樹之後，我們查詢某個字元串在不在字首樹中，某字元串在這顆字首樹中出現了幾次都是特别友善的。例如找"ab"在字首樹中存在幾次，可以先看有無走向a字元的路徑(如果沒有，直接不存在)，再看走向b字元的路徑，此時檢查該節點的end标記的值，如果為0，則字首樹中不存在"ab"字元串，如果e>0則，e等于幾則"ab"在字首樹種出現了幾次

功能二：如果單單是功能一，那麼哈希表也可以實作。現查詢所有加入到字首樹的字元串，有多少個以"a"字元作為字首，來到"a"的路徑，檢視p值大小，就是以"a"作為字首的字元串數量

package class05;

import java.util.HashMap;

// 該程式完全正确
public class Code02_TrieTree {

	public static class Node1 {
	    // pass表示字元從該節點的路徑通過
		public int pass;
		// end表示該字元到此節點結束
		public int end;
		public Node1[] nexts;

		public Node1() {
			pass = 0;
			end = 0;
			// 每個節點下預設26條路，分别是a~z
			// 0    a
			// 1    b
			// 2    c
			// ..   ..
			// 25   z
			// nexts[i] == null   i方向的路不存在
			// nexts[i] != null   i方向的路存在
			nexts = new Node1[26];
		}
	}

	public static class Trie1 {
	   // 預設隻留出頭節點
		private Node1 root;

		public Trie1() {
			root = new Node1();
		}

        // 往該字首樹中添加字元串
		public void insert(String word) {
			if (word == null) {
				return;
			}
			char[] str = word.toCharArray();
			// 初始引用指向頭節點
			Node1 node = root;
			// 頭結點的pass首先++
			node.pass++;
			// 路徑的下标
			int path = 0;
			for (int i = 0; i < str.length; i++) { // 從左往右周遊字元
			    // 目前字元減去'a'的ascii碼得到需要添加的下個節點下标
				path = str[i] - 'a'; // 由字元，對應成走向哪條路
				// 目前方向上沒有建立節點，即一開始不存在這條路，新開辟
				if (node.nexts[path] == null) {
					node.nexts[path] = new Node1();
				}
				// 引用指向目前來到的節點
				node = node.nexts[path];
				// 目前節點的pass++
				node.pass++;
			}
			// 當新加的字元串所有字元處理結束，最後引用指向的目前節點就是該字元串的結尾節點，end++
			node.end++;
		}

        // 删除該字首樹的某個字元串
		public void delete(String word) {
		    // 首先要查一下該字元串是否加入過
			if (search(word) != 0) {
			    // 沿途pass--
				char[] chs = word.toCharArray();
				Node1 node = root;
				node.pass--;
				int path = 0;
				for (int i = 0; i < chs.length; i++) {
					path = chs[i] - 'a';
					// 在尋找的過程中，pass為0，提前可以得知在本次删除之後，該節點以下的路徑不再需要，可以直接删除。
					// 那麼該節點之下下個方向的節點引用置為空（JVM垃圾回收，相當于該節點下的路徑被删了）
					if (--node.nexts[path].pass == 0) {
						node.nexts[path] = null;
						return;
					}
					node = node.nexts[path];
				}
				// 最後end--
				node.end--;
			}
		}
        // 在該字首樹中查找
		// word這個單詞之前加入過幾次
		public int search(String word) {
			if (word == null) {
				return 0;
			}
			char[] chs = word.toCharArray();
			Node1 node = root;
			int index = 0;
			for (int i = 0; i < chs.length; i++) {
				index = chs[i] - 'a';
				// 尋找該字元串的路徑中如果提前找不到path，就是未加入過，0次
				if (node.nexts[index] == null) {
					return 0;
				}
				node = node.nexts[index];
			}
			// 如果順利把word字元串在字首樹中走完路徑，那麼此時的node對應的end值就是目前word在該字首樹中添加了幾次
			return node.end;
		}

		// 所有加入的字元串中，有幾個是以pre這個字元串作為字首的
		public int prefixNumber(String pre) {
			if (pre == null) {
				return 0;
			}
			char[] chs = pre.toCharArray();
			Node1 node = root;
			int index = 0;
			for (int i = 0; i < chs.length; i++) {
				index = chs[i] - 'a';
				// 走不到最後，就沒有
				if (node.nexts[index] == null) {
					return 0;
				}
				node = node.nexts[index];
			}
			// 順利走到最後，傳回的pass就是有多少個字元串以目前pre為字首的
			return node.pass;
		}
	}


    /**
    * 實作方式二,針對各種字元串，路徑不僅僅是a~z對應的26個，用HashMap<Integer, Node2>表示ascii碼值對應的node。
    **/
	public static class Node2 {
		public int pass;
		public int end;
		public HashMap<Integer, Node2> nexts;

		public Node2() {
			pass = 0;
			end = 0;
			nexts = new HashMap<>();
		}
	}

	public static class Trie2 {
		private Node2 root;

		public Trie2() {
			root = new Node2();
		}

		public void insert(String word) {
			if (word == null) {
				return;
			}
			char[] chs = word.toCharArray();
			Node2 node = root;
			node.pass++;
			int index = 0;
			for (int i = 0; i < chs.length; i++) {
				index = (int) chs[i];
				if (!node.nexts.containsKey(index)) {
					node.nexts.put(index, new Node2());
				}
				node = node.nexts.get(index);
				node.pass++;
			}
			node.end++;
		}

		public void delete(String word) {
			if (search(word) != 0) {
				char[] chs = word.toCharArray();
				Node2 node = root;
				node.pass--;
				int index = 0;
				for (int i = 0; i < chs.length; i++) {
					index = (int) chs[i];
					if (--node.nexts.get(index).pass == 0) {
						node.nexts.remove(index);
						return;
					}
					node = node.nexts.get(index);
				}
				node.end--;
			}
		}

		// word這個單詞之前加入過幾次
		public int search(String word) {
			if (word == null) {
				return 0;
			}
			char[] chs = word.toCharArray();
			Node2 node = root;
			int index = 0;
			for (int i = 0; i < chs.length; i++) {
				index = (int) chs[i];
				if (!node.nexts.containsKey(index)) {
					return 0;
				}
				node = node.nexts.get(index);
			}
			return node.end;
		}

		// 所有加入的字元串中，有幾個是以pre這個字元串作為字首的
		public int prefixNumber(String pre) {
			if (pre == null) {
				return 0;
			}
			char[] chs = pre.toCharArray();
			Node2 node = root;
			int index = 0;
			for (int i = 0; i < chs.length; i++) {
				index = (int) chs[i];
				if (!node.nexts.containsKey(index)) {
					return 0;
				}
				node = node.nexts.get(index);
			}
			return node.pass;
		}
	}

    /**
    * 不用字首樹，純暴力的組織，用來做對數器
    **/
	public static class Right {

		private HashMap<String, Integer> box;

		public Right() {
			box = new HashMap<>();
		}

		public void insert(String word) {
			if (!box.containsKey(word)) {
				box.put(word, 1);
			} else {
				box.put(word, box.get(word) + 1);
			}
		}

		public void delete(String word) {
			if (box.containsKey(word)) {
				if (box.get(word) == 1) {
					box.remove(word);
				} else {
					box.put(word, box.get(word) - 1);
				}
			}
		}

		public int search(String word) {
			if (!box.containsKey(word)) {
				return 0;
			} else {
				return box.get(word);
			}
		}

		public int prefixNumber(String pre) {
			int count = 0;
			for (String cur : box.keySet()) {
				if (cur.startsWith(pre)) {
					count += box.get(cur);
				}
			}
			return count;
		}
	}

	// for test
	public static String generateRandomString(int strLen) {
		char[] ans = new char[(int) (Math.random() * strLen) + 1];
		for (int i = 0; i < ans.length; i++) {
			int value = (int) (Math.random() * 6);
			ans[i] = (char) (97 + value);
		}
		return String.valueOf(ans);
	}

	// for test
	public static String[] generateRandomStringArray(int arrLen, int strLen) {
		String[] ans = new String[(int) (Math.random() * arrLen) + 1];
		for (int i = 0; i < ans.length; i++) {
			ans[i] = generateRandomString(strLen);
		}
		return ans;
	}

	public static void main(String[] args) {
		int arrLen = 100;
		int strLen = 20;
		int testTimes = 100000;
		for (int i = 0; i < testTimes; i++) {
			String[] arr = generateRandomStringArray(arrLen, strLen);
			Trie1 trie1 = new Trie1();
			Trie2 trie2 = new Trie2();
			Right right = new Right();
			for (int j = 0; j < arr.length; j++) {
				double decide = Math.random();
				if (decide < 0.25) {
					trie1.insert(arr[j]);
					trie2.insert(arr[j]);
					right.insert(arr[j]);
				} else if (decide < 0.5) {
					trie1.delete(arr[j]);
					trie2.delete(arr[j]);
					right.delete(arr[j]);
				} else if (decide < 0.75) {
					int ans1 = trie1.search(arr[j]);
					int ans2 = trie2.search(arr[j]);
					int ans3 = right.search(arr[j]);
					if (ans1 != ans2 || ans2 != ans3) {
						System.out.println("Oops!");
					}
				} else {
					int ans1 = trie1.prefixNumber(arr[j]);
					int ans2 = trie2.prefixNumber(arr[j]);
					int ans3 = right.prefixNumber(arr[j]);
					if (ans1 != ans2 || ans2 != ans3) {
						System.out.println("Oops!");
					}
				}
			}
		}
		System.out.println("finish!");

	}

}

1.2 不基于比較的排序-桶排序

例如：一個代表員工年齡的數組，排序。資料範圍有限，對每個年齡做詞頻統計。arr[0~200] = 0,M=200

空間換時間

1.2.1 計數排序

桶排序思想下的排序：計數排序 & 基數排序

1、 桶排序思想下的排序都是不基于比較的排序

2、 時間複雜度為O(N),二維空間複雜複雜度為O(M)

3、 應用範圍有限，需要樣本的資料狀況滿足桶的劃分

缺點：與樣本資料狀況強相關。

1.2.2 基數排序

應用條件：十進制資料，非負

[100,17,29,13,5,27] 進行排序 =>

1、找最高位的那個數的長度，這裡100的長度為3，其他數前補0，得出

[100,017,029,013,005,027]

2、 準備10個桶，對應的數字0~9号桶，每個桶是一個隊列。根據樣本按個位數字對應進桶，相同個位數字進入隊列，再從0号桶以此倒出，隊列先進先出。個位進桶再依次倒出，得出：

[100,013,005,017,027,029]

3、 再把按照個位進桶倒出的樣本，再按十位進桶，再按相同規則倒出得：

[100,005,013,017,027,029]

4、再把得到的樣本按百位進桶，倒出得：

[005,013,017,027,029,100]

此時達到有序！

思想：先按各位數字排序，各位數字排好序，再用十位數字的順序去調整，再按百位次序調整。優先級依次遞增，百位優先級最高，百位優先級一樣預設按照上一層十位的順序...

結論：基于比較的排序，時間複雜度的極限就是O(NlogN)，而不基于比較的排序，時間複雜度可以達到O(N)。在面試或刷題，估算排序的時間複雜度的時候，必須用基于比較的排序來估算

/**
* 計數排序
**/
package class05;

import java.util.Arrays;

public class Code03_CountSort {

    // 計數排序
	// only for 0~200 value
	public static void countSort(int[] arr) {
		if (arr == null || arr.length < 2) {
			return;
		}
		int max = Integer.MIN_VALUE;
		for (int i = 0; i < arr.length; i++) {
			max = Math.max(max, arr[i]);
		}
		int[] bucket = new int[max + 1];
		for (int i = 0; i < arr.length; i++) {
			bucket[arr[i]]++;
		}
		int i = 0;
		for (int j = 0; j < bucket.length; j++) {
			while (bucket[j]-- > 0) {
				arr[i++] = j;
			}
		}
	}

	// for test
	public static void comparator(int[] arr) {
		Arrays.sort(arr);
	}

	// for test
	public static int[] generateRandomArray(int maxSize, int maxValue) {
		int[] arr = new int[(int) ((maxSize + 1) * Math.random())];
		for (int i = 0; i < arr.length; i++) {
			arr[i] = (int) ((maxValue + 1) * Math.random());
		}
		return arr;
	}

	// for test
	public static int[] copyArray(int[] arr) {
		if (arr == null) {
			return null;
		}
		int[] res = new int[arr.length];
		for (int i = 0; i < arr.length; i++) {
			res[i] = arr[i];
		}
		return res;
	}

	// for test
	public static boolean isEqual(int[] arr1, int[] arr2) {
		if ((arr1 == null && arr2 != null) || (arr1 != null && arr2 == null)) {
			return false;
		}
		if (arr1 == null && arr2 == null) {
			return true;
		}
		if (arr1.length != arr2.length) {
			return false;
		}
		for (int i = 0; i < arr1.length; i++) {
			if (arr1[i] != arr2[i]) {
				return false;
			}
		}
		return true;
	}

	// for test
	public static void printArray(int[] arr) {
		if (arr == null) {
			return;
		}
		for (int i = 0; i < arr.length; i++) {
			System.out.print(arr[i] + " ");
		}
		System.out.println();
	}

	// for test
	public static void main(String[] args) {
		int testTime = 500000;
		int maxSize = 100;
		int maxValue = 150;
		boolean succeed = true;
		for (int i = 0; i < testTime; i++) {
			int[] arr1 = generateRandomArray(maxSize, maxValue);
			int[] arr2 = copyArray(arr1);
			countSort(arr1);
			comparator(arr2);
			if (!isEqual(arr1, arr2)) {
				succeed = false;
				printArray(arr1);
				printArray(arr2);
				break;
			}
		}
		System.out.println(succeed ? "Nice!" : "Fucking fucked!");

		int[] arr = generateRandomArray(maxSize, maxValue);
		printArray(arr);
		countSort(arr);
		printArray(arr);

	}

}

下面代碼的思想：

例如原數組[101,003,202,41,302]。得到按個位的詞頻conut數組為[0,2,2,1,0,0,0,0,0,0]。通過conut詞頻累加得到conut'為[0,2,4,5,5,5,5,5,5,5],此時conut'的含義表示個位數字小于等于0的數字有0個，個位數字小于等于1的有兩個，個位數字小于等于2的有4個......

得到conut'之後，對原數組[101,003,202,41,302]從右往左周遊。根據基數排序的思想，302應該是2号桶最後被倒出的，我們已經知道個位數字小于等于2的有4個，那麼302就是4個中的最後一個，放在help數組的3号位置,相應的conut'小于等于2位置的詞頻減減變為3。同理，41是1号桶的最後一個，個位數字小于等于1的數字有兩個，那麼41需要放在1号位置，小于等于1位置的詞頻減減變為1，同理......

實質增加conut和count'結構，避免申請十個隊列結構，不想炫技直接申請10個隊列結構，按基數排序思想直接做沒問題

實質上，基數排序的時間複雜度是O(Nlog10max(N)),log10N表示十進制的數的位數，但是我們認為基數排序的應用樣本範圍不大。如果要排任意位數的值，嚴格上就是O(Nlog10max(N))

/**
* 基數排序
**/
package class05;

import java.util.Arrays;

public class Code04_RadixSort {

    // 非負數，十進制，如果負數需要深度改寫這個方法
	// only for no-negative value
	public static void radixSort(int[] arr) {
		if (arr == null || arr.length < 2) {
			return;
		}
		radixSort(arr, 0, arr.length - 1, maxbits(arr));
	}

    // 計算數組樣本中最大值的位數
	public static int maxbits(int[] arr) {
		int max = Integer.MIN_VALUE;
		for (int i = 0; i < arr.length; i++) {
			max = Math.max(max, arr[i]);
		}
		int res = 0;
		while (max != 0) {
			res++;
			max /= 10;
		}
		return res;
	}

	// arr[l..r]排序  ,  digit：最大值的位數
	// l..r    [3, 56, 17, 100]    3
	public static void radixSort(int[] arr, int L, int R, int digit) {
	    // 由于十進制的數，我們依10位基底
		final int radix = 10;
		int i = 0, j = 0;
		// 有多少個數準備多少個輔助空間
		int[] help = new int[R - L + 1];
		for (int d = 1; d <= digit; d++) { // 有多少位就進出幾次
			// 10個空間
		    // count[0] 目前位(d位)是0的數字有多少個
			// count[1] 目前位(d位)是(0和1)的數字有多少個
			// count[2] 目前位(d位)是(0、1和2)的數字有多少個
			// count[i] 目前位(d位)是(0~i)的數字有多少個
			int[] count = new int[radix]; // count[0..9]
			for (i = L; i <= R; i++) {
				// 103的話  d是1表示個位 取出j=3
				// 209  1   9
				j = getDigit(arr[i], d);
				count[j]++;
			}
			// conut往conut'的轉化
			for (i = 1; i < radix; i++) {
				count[i] = count[i] + count[i - 1];
			}
			// i從最後位置往前看
			for (i = R; i >= L; i--) {
				j = getDigit(arr[i], d);
				help[count[j] - 1] = arr[i];
				// 詞頻--
				count[j]--;
			}
			// 處理完個位十位...之後都要往原數組copy
			for (i = L, j = 0; i <= R; i++, j++) {
				arr[i] = help[j];
			}
		}
		
		
		
		
	}

	public static int getDigit(int x, int d) {
		return ((x / ((int) Math.pow(10, d - 1))) % 10);
	}

	// for test
	public static void comparator(int[] arr) {
		Arrays.sort(arr);
	}

	// for test
	public static int[] generateRandomArray(int maxSize, int maxValue) {
		int[] arr = new int[(int) ((maxSize + 1) * Math.random())];
		for (int i = 0; i < arr.length; i++) {
			arr[i] = (int) ((maxValue + 1) * Math.random());
		}
		return arr;
	}

	// for test
	public static int[] copyArray(int[] arr) {
		if (arr == null) {
			return null;
		}
		int[] res = new int[arr.length];
		for (int i = 0; i < arr.length; i++) {
			res[i] = arr[i];
		}
		return res;
	}

	// for test
	public static boolean isEqual(int[] arr1, int[] arr2) {
		if ((arr1 == null && arr2 != null) || (arr1 != null && arr2 == null)) {
			return false;
		}
		if (arr1 == null && arr2 == null) {
			return true;
		}
		if (arr1.length != arr2.length) {
			return false;
		}
		for (int i = 0; i < arr1.length; i++) {
			if (arr1[i] != arr2[i]) {
				return false;
			}
		}
		return true;
	}

	// for test
	public static void printArray(int[] arr) {
		if (arr == null) {
			return;
		}
		for (int i = 0; i < arr.length; i++) {
			System.out.print(arr[i] + " ");
		}
		System.out.println();
	}

	// for test
	public static void main(String[] args) {
		int testTime = 500000;
		int maxSize = 100;
		int maxValue = 100000;
		boolean succeed = true;
		for (int i = 0; i < testTime; i++) {
			int[] arr1 = generateRandomArray(maxSize, maxValue);
			int[] arr2 = copyArray(arr1);
			radixSort(arr1);
			comparator(arr2);
			if (!isEqual(arr1, arr2)) {
				succeed = false;
				printArray(arr1);
				printArray(arr2);
				break;
			}
		}
		System.out.println(succeed ? "Nice!" : "Fucking fucked!");

		int[] arr = generateRandomArray(maxSize, maxValue);
		printArray(arr);
		radixSort(arr);
		printArray(arr);

	}

}

1.3 排序算法的穩定性

穩定性是指同樣大小的樣本在排序之後不會改變相對次序。基礎類型穩定性沒意義，用處是按引用傳遞後是否穩定。比如學生有班級和年齡兩個屬性，先按班級排序，再按年齡排序，那麼如果是穩定性的排序，不會破壞之前已經按班級拍好的順序

穩定性排序的應用場景：購物時候，先按價格排序商品，再按好評度排序，那麼好評度實在價格排好序的基礎上。反之不穩定排序會破壞一開始按照價格排好的次序

1.3.1 穩定的排序

1、冒泡排序（處理相等時不交換）

2、插入排序（相等不交換）

3、歸并排序（merge時候，相等先copy左邊的）

1.3.2 不穩定的排序

1、選擇排序

2、快速排序（partion過程無法保證穩定）

3、堆排序（維持堆結構）

1.3.3 排序穩定性對比

排序	時間複雜度	空間複雜度	穩定性
選擇排序	O(N^2)	O(1)	無
冒泡排序	O(N^2)	O(1)	有
插入排序	O(N^2)	O(1)	有
歸并排序	O(NlogN)	O(N)	有
随機快拍	O(NlogN)	O(logN)	無
堆排序	O(NlogN)	O(1)	無
計數排序	O(N)	O(M)	有
堆排序	O(N)	O(N)	有

1.4 排序算法總結

不基于比較的排序，對樣本資料有嚴格要求，不易改寫
基于比較的排序，隻要規定好兩個樣本怎麼比較大小就可以直接複用
基于比較的排序，時間複雜度的極限是O(NlogN)
時間複雜度O(NlogN)、額外空間複雜度低于O(N)，且穩定的基于比較的排序是不存在的
為了絕對的速度選擇快排（快排的常數時間低），為了節省空間選擇堆排序，為了穩定性選歸并

1.5 排序常見的坑點

歸并排序的額為空間複雜度可以變為O(1)。“歸并排序内部緩存法”，但是将會變的不穩定。不考慮穩定不如直接選擇堆排序

“原地歸并排序”是垃圾文章，會讓時間複雜度變成O(N ^2)。時間複雜度退到O(N ^2)不如直接選擇插入排序

快速排序穩定性改進，“01 stable sort”，但是會對樣本資料要求更多。對資料進行限制，不如選擇桶排序

在整形數組中，請把奇數放在數組左邊，偶數放在數組右邊，要求所有奇數之間原始次序不變，所有偶數之間原始次序不變。要求時間複雜度O(N),額為空間複雜度O(1)。這是個01标準的partion，奇偶規則，但是快速排序的partion過程做不到穩定性。是以正常實作不了，學術論文(01 stable sort,不建議碰，比較難)中需要把資料閹割限制之後才能做到

1.6 工程上對排序的改進

穩定性考慮：值傳遞，直接快排，引用傳遞，歸并排序

充分利用O(NlogN)和O(N^2)排序各自的優勢：根據樣本量底層基于多種排序實作，比如樣本量比較小直接選擇插入排序。

比如Java中系統實作的快速排序