{"cells":[{"cell_type":"markdown","metadata":{"id":"XjTc5n2flYUu"},"source":["# 특성 공학과 규제\n","특성이 2개면 선형 회귀는 평면을 학습한다.\n","\n","농어의 길이 × 농어의 높이처럼 기존 특성을 조합하여 새로운 특성을 만들 수 있다. 이렇게 기존의 특성을 사용하여 새로운 특성을 뽑아내는 작업을 특성 공학이라고 한다."]},{"cell_type":"markdown","metadata":{"id":"fZwhQU2l8tI6"},"source":["## 데이터 준비"]},{"cell_type":"code","execution_count":1,"metadata":{"id":"3kjaTfOqEVwY","executionInfo":{"status":"ok","timestamp":1679372973215,"user_tz":-540,"elapsed":1092,"user":{"displayName":"j j","userId":"09320029400633244007"}}},"outputs":[],"source":["import pandas as pd"]},{"cell_type":"code","execution_count":5,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"8qmTS1RzKRKT","outputId":"070bf23c-6cc0-45a4-8a6c-4a0e1b121ddc","executionInfo":{"status":"ok","timestamp":1679373126662,"user_tz":-540,"elapsed":307,"user":{"displayName":"j j","userId":"09320029400633244007"}}},"outputs":[{"output_type":"stream","name":"stdout","text":["[[ 8.4 2.11 1.41]\n"," [13.7 3.53 2. ]\n"," [15. 3.82 2.43]\n"," [16.2 4.59 2.63]\n"," [17.4 4.59 2.94]\n"," [18. 5.22 3.32]\n"," [18.7 5.2 3.12]\n"," [19. 5.64 3.05]\n"," [19.6 5.14 3.04]\n"," [20. 5.08 2.77]\n"," [21. 5.69 3.56]\n"," [21. 5.92 3.31]\n"," [21. 5.69 3.67]\n"," [21.3 6.38 3.53]\n"," [22. 6.11 3.41]\n"," [22. 5.64 3.52]\n"," [22. 6.11 3.52]\n"," [22. 5.88 3.52]\n"," [22. 5.52 4. ]\n"," [22.5 5.86 3.62]\n"," [22.5 6.79 3.62]\n"," [22.7 5.95 3.63]\n"," [23. 5.22 3.63]\n"," [23.5 6.28 3.72]\n"," [24. 7.29 3.72]\n"," [24. 6.38 3.82]\n"," [24.6 6.73 4.17]\n"," [25. 6.44 3.68]\n"," [25.6 6.56 4.24]\n"," [26.5 7.17 4.14]\n"," [27.3 8.32 5.14]\n"," [27.5 7.17 4.34]\n"," [27.5 7.05 4.34]\n"," [27.5 7.28 4.57]\n"," [28. 7.82 4.2 ]\n"," [28.7 7.59 4.64]\n"," [30. 7.62 4.77]\n"," [32.8 10.03 6.02]\n"," [34.5 10.26 6.39]\n"," [35. 11.49 7.8 ]\n"," [36.5 10.88 6.86]\n"," [36. 10.61 6.74]\n"," [37. 10.84 6.26]\n"," [37. 10.57 6.37]\n"," [39. 11.14 7.49]\n"," [39. 11.14 6. ]\n"," [39. 12.43 7.35]\n"," [40. 11.93 7.11]\n"," [40. 11.73 7.22]\n"," [40. 
12.38 7.46]\n"," [40. 11.14 6.63]\n"," [42. 12.8 6.87]\n"," [43. 11.93 7.28]\n"," [43. 12.51 7.42]\n"," [43.5 12.6 8.14]\n"," [44. 12.49 7.6 ]]\n"]}],"source":["# Load the perch feature table and convert it to a NumPy matrix (three columns per row in the printed output)\n","df = pd.read_csv('https://bit.ly/perch_csv_data')\n","perch_full = df.to_numpy()\n","print(perch_full)"]},{"cell_type":"code","execution_count":6,"metadata":{"id":"PsRC7rvE9SbL","executionInfo":{"status":"ok","timestamp":1679373128241,"user_tz":-540,"elapsed":5,"user":{"displayName":"j j","userId":"09320029400633244007"}}},"outputs":[],"source":["import numpy as np\n","\n","# Target values, passed to train_test_split together with perch_full (one value per row)\n","perch_weight = np.array([\n","    5.9, 32.0, 40.0, 51.5, 70.0, 100.0, 78.0, 80.0, 85.0, 85.0,\n","    110.0, 115.0, 125.0, 130.0, 120.0, 120.0, 130.0, 135.0, 110.0,\n","    130.0, 150.0, 145.0, 150.0, 170.0, 225.0, 145.0, 188.0, 180.0,\n","    197.0, 218.0, 300.0, 260.0, 265.0, 250.0, 250.0, 300.0, 320.0,\n","    514.0, 556.0, 840.0, 685.0, 700.0, 700.0, 690.0, 900.0, 650.0,\n","    820.0, 850.0, 900.0, 1015.0, 820.0, 1100.0, 1000.0, 1100.0,\n","    1000.0, 1000.0,\n","])"]},{"cell_type":"code","execution_count":7,"metadata":{"id":"cRKkoWoZ9J0m","executionInfo":{"status":"ok","timestamp":1679373159708,"user_tz":-540,"elapsed":1202,"user":{"displayName":"j j","userId":"09320029400633244007"}}},"outputs":[],"source":["from sklearn.model_selection import train_test_split\n","\n","# Split into training and test sets; the fixed random_state keeps the split reproducible\n","train_input, test_input, train_target, test_target = train_test_split(\n","    perch_full, perch_weight, random_state=42\n",")"]},{"cell_type":"markdown","metadata":{"id":"y5uMFE_8V1tx"},"source":["## 사이킷런의 변환기\n","사이킷런에서는 특성을 만들거나 전처리하기위한 다양한 클래스를 제공한다. 
사이킷런에서 이런 클래스를 변환기라고한다.\n"]},{"cell_type":"code","execution_count":9,"metadata":{"id":"EclugdXmSs-L","executionInfo":{"status":"ok","timestamp":1679373221748,"user_tz":-540,"elapsed":270,"user":{"displayName":"j j","userId":"09320029400633244007"}}},"outputs":[],"source":["from sklearn.preprocessing import PolynomialFeatures"]},{"cell_type":"code","execution_count":12,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"V5is7cZhKbPU","outputId":"8eee33b0-e291-4501-bef5-4a53df96384b","executionInfo":{"status":"ok","timestamp":1679373259863,"user_tz":-540,"elapsed":7,"user":{"displayName":"j j","userId":"09320029400633244007"}}},"outputs":[{"output_type":"stream","name":"stdout","text":["[[1. 2. 3. 4. 6. 9.]]\n"]}],"source":["# Default settings: for the sample [a, b] = [2, 3] the printed row is 1, a, b, a^2, a*b, b^2\n","poly = PolynomialFeatures().fit([[2, 3]])\n","print(poly.transform([[2, 3]]))"]},{"cell_type":"code","execution_count":13,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"bKXkK0oJc4nG","outputId":"3bbfa12c-c662-4a7f-bf6f-c6c182f14930","executionInfo":{"status":"ok","timestamp":1679373260128,"user_tz":-540,"elapsed":2,"user":{"displayName":"j j","userId":"09320029400633244007"}}},"outputs":[{"output_type":"stream","name":"stdout","text":["[[2. 3. 4. 6. 
9.]]\n"]}],"source":["# include_bias=False drops the leading constant-1 column\n","poly = PolynomialFeatures(include_bias=False).fit([[2, 3]])\n","print(poly.transform([[2, 3]]))"]},{"cell_type":"code","execution_count":14,"metadata":{"id":"__kE6eJdNZfm","executionInfo":{"status":"ok","timestamp":1679373261138,"user_tz":-540,"elapsed":2,"user":{"displayName":"j j","userId":"09320029400633244007"}}},"outputs":[],"source":["poly = PolynomialFeatures(include_bias=False)\n","\n","# Fit on the training set and expand it in one step\n","train_poly = poly.fit_transform(train_input)"]},{"cell_type":"code","execution_count":15,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"2a_lmkKle4kF","outputId":"73f9b42d-c33d-4f1a-95bc-84e031d7de7e","executionInfo":{"status":"ok","timestamp":1679373261889,"user_tz":-540,"elapsed":347,"user":{"displayName":"j j","userId":"09320029400633244007"}}},"outputs":[{"output_type":"stream","name":"stdout","text":["(42, 9)\n"]}],"source":["print(train_poly.shape)"]},{"cell_type":"code","execution_count":17,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"X6GUwfXTfKbl","outputId":"4446ab62-619e-45e8-c85e-aefdbe9066ae","executionInfo":{"status":"ok","timestamp":1679373279946,"user_tz":-540,"elapsed":250,"user":{"displayName":"j j","userId":"09320029400633244007"}}},"outputs":[{"output_type":"execute_result","data":{"text/plain":["array(['x0', 'x1', 'x2', 'x0^2', 'x0 x1', 'x0 x2', 'x1^2', 'x1 x2',\n"," 'x2^2'], dtype=object)"]},"metadata":{},"execution_count":17}],"source":["# Show which combination of input features produced each output column\n","poly.get_feature_names_out()"]},{"cell_type":"code","execution_count":22,"metadata":{"id":"DJMPxe2mgbOo","executionInfo":{"status":"ok","timestamp":1679373380794,"user_tz":-540,"elapsed":278,"user":{"displayName":"j j","userId":"09320029400633244007"}}},"outputs":[],"source":["# Reuse the transformer that was fitted on the training set\n","test_poly = poly.transform(test_input)"]},{"cell_type":"markdown","metadata":{"id":"PdDAslHzNk3H"},"source":["## 다중 회귀 모델 훈련하기\n","다중 회귀모델을 훈련하는 것은 선형회귀 모델을 훈련하는 것과 같다, 다만 여러개의 특성을 이용하여 선형회귀를 
수행한다"]},{"cell_type":"code","execution_count":23,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"E9ygy-8WOvIP","outputId":"889b6ae7-4e3e-45ee-f3fd-79d8eec5db3e","executionInfo":{"status":"ok","timestamp":1679373383269,"user_tz":-540,"elapsed":374,"user":{"displayName":"j j","userId":"09320029400633244007"}}},"outputs":[{"output_type":"stream","name":"stdout","text":["0.9903183436982125\n"]}],"source":["from sklearn.linear_model import LinearRegression\n","\n","# fit() returns the estimator itself, so construction and training can be chained\n","lr = LinearRegression().fit(train_poly, train_target)\n","print(lr.score(train_poly, train_target))"]},{"cell_type":"code","execution_count":25,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"GKKyfFcAd7zm","outputId":"dc92e8f0-8837-45c4-b8d5-aa433d01e39a","executionInfo":{"status":"ok","timestamp":1679373390094,"user_tz":-540,"elapsed":273,"user":{"displayName":"j j","userId":"09320029400633244007"}}},"outputs":[{"output_type":"stream","name":"stdout","text":["0.9714559911594111\n"]}],"source":["print(lr.score(test_poly, test_target))"]},{"cell_type":"code","execution_count":26,"metadata":{"id":"2fDt5mrReMwU","executionInfo":{"status":"ok","timestamp":1679373391849,"user_tz":-540,"elapsed":1,"user":{"displayName":"j j","userId":"09320029400633244007"}}},"outputs":[],"source":["# Rebuild the polynomial features with terms up to degree 5\n","poly = PolynomialFeatures(degree=5, include_bias=False)\n","\n","train_poly = poly.fit_transform(train_input)\n","test_poly = poly.transform(test_input)"]},{"cell_type":"code","execution_count":27,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"hcM8R4VHSzR8","outputId":"0a679d51-8087-471b-9fd6-e5b7bd07935a","executionInfo":{"status":"ok","timestamp":1679373394245,"user_tz":-540,"elapsed":7,"user":{"displayName":"j j","userId":"09320029400633244007"}}},"outputs":[{"output_type":"stream","name":"stdout","text":["(42, 
55)\n"]}],"source":["print(train_poly.shape)"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"UffVFVTGP8xj","outputId":"06a6358f-3432-4520-abf7-272ace86cc35"},"outputs":[{"output_type":"stream","name":"stdout","text":["0.9999999999996433\n"]}],"source":["# Refit the same estimator on the degree-5 features\n","lr.fit(train_poly, train_target)\n","print(lr.score(train_poly, train_target))"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"GtITdlYFg7AY","outputId":"5858264a-117c-4b42-c9d5-5f6e8861cdd1"},"outputs":[{"output_type":"stream","name":"stdout","text":["-144.40579436844948\n"]}],"source":["# Test-set score: the large negative R^2 shows the model has overfit the training set\n","print(lr.score(test_poly, test_target))"]},{"cell_type":"markdown","metadata":{"id":"K2YMPSelQBpO"},"source":["## 규제\n","\n","머신러닝 모델이 훈련세트를 너무 과도하게 학습하지 못하도록 훼방하는것, 선형회귀 모델의 경우 특성에 곱해지는 계수의 크기를 작게 만드는일"]},{"cell_type":"code","execution_count":29,"metadata":{"id":"hCC7wKy3QQrE","executionInfo":{"status":"ok","timestamp":1679373866766,"user_tz":-540,"elapsed":382,"user":{"displayName":"j j","userId":"09320029400633244007"}}},"outputs":[],"source":["from sklearn.preprocessing import StandardScaler\n","\n","# Fit the scaler on the training features only, then apply the same scaling to both sets\n","ss = StandardScaler()\n","train_scaled = ss.fit_transform(train_poly)\n","test_scaled = ss.transform(test_poly)"]},{"cell_type":"markdown","metadata":{"id":"qyLI7JQsJ7RQ"},"source":["## 릿지\n","\n","계수를 제곱한 값을 기준으로 규제를 적용"]},{"cell_type":"code","execution_count":30,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"LdNuDNQGQipv","outputId":"354fc57a-a40c-466b-ca1d-3fe42e9fcd9a","executionInfo":{"status":"ok","timestamp":1679373873949,"user_tz":-540,"elapsed":371,"user":{"displayName":"j j","userId":"09320029400633244007"}}},"outputs":[{"output_type":"stream","name":"stdout","text":["0.9896101671037343\n"]}],"source":["from sklearn.linear_model import Ridge\n","\n","ridge = Ridge()\n","ridge.fit(train_scaled, 
train_target)\n","print(ridge.score(train_scaled, train_target))"]},{"cell_type":"code","execution_count":31,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"p5mXLecwhdnF","outputId":"344f680e-b2ff-4fdf-bd22-18f013ad9deb","executionInfo":{"status":"ok","timestamp":1679373897371,"user_tz":-540,"elapsed":309,"user":{"displayName":"j j","userId":"09320029400633244007"}}},"outputs":[{"output_type":"stream","name":"stdout","text":["0.9790693977615387\n"]}],"source":["print(ridge.score(test_scaled, test_target))"]},{"cell_type":"code","execution_count":32,"metadata":{"id":"wXd3_Kq6hlbM","executionInfo":{"status":"ok","timestamp":1679373899815,"user_tz":-540,"elapsed":283,"user":{"displayName":"j j","userId":"09320029400633244007"}}},"outputs":[],"source":["import matplotlib.pyplot as plt"]},{"cell_type":"code","execution_count":39,"metadata":{"id":"9MvIvQOrhfqC","executionInfo":{"status":"ok","timestamp":1679374029398,"user_tz":-540,"elapsed":472,"user":{"displayName":"j j","userId":"09320029400633244007"}}},"outputs":[],"source":["# Reset the score lists in this cell so that re-running it cannot accumulate\n","# stale entries and make the lists longer than alpha_list (which breaks the plot).\n","train_score = []\n","test_score = []\n","\n","alpha_list = [0.001, 0.01, 0.1, 1, 10, 100]\n","for alpha in alpha_list:\n","    # Create the ridge model for this regularisation strength\n","    ridge = Ridge(alpha=alpha)\n","    # Train the ridge model\n","    ridge.fit(train_scaled, train_target)\n","    # Store the training and test scores\n","    train_score.append(ridge.score(train_scaled, train_target))\n","    test_score.append(ridge.score(test_scaled, test_target))"]},{"cell_type":"code","execution_count":40,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":279},"id":"95DjrJxlhiow","outputId":"33635ce0-b8c0-4737-c19f-3a8fdaaf5f1d","executionInfo":{"status":"ok","timestamp":1679374030630,"user_tz":-540,"elapsed":10,"user":{"displayName":"j j","userId":"09320029400633244007"}}},"outputs":[{"output_type":"display_data","data":{"text/plain":["<Figure size 432x288 with 1 Axes>"],"image/png":"\n"},"metadata":{"needs_background":"light"}}],"source":["plt.plot(np.log10(alpha_list), 
train_score, label='train')\n","plt.plot(np.log10(alpha_list), test_score, label='test')\n","# The x values are log10(alpha), so the axis label must say so\n","plt.xlabel('log10(alpha)')\n","plt.ylabel('R^2')\n","plt.legend()\n","plt.show()"]},{"cell_type":"code","execution_count":41,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"5S5vhi-vhjzT","outputId":"20dc7997-cddb-4d3b-d1f1-a2bf29b213e1","executionInfo":{"status":"ok","timestamp":1679374032167,"user_tz":-540,"elapsed":8,"user":{"displayName":"j j","userId":"09320029400633244007"}}},"outputs":[{"output_type":"stream","name":"stdout","text":["0.9903815817570367\n","0.9827976465386928\n"]}],"source":["ridge = Ridge(alpha=0.1)\n","ridge.fit(train_scaled, train_target)\n","\n","print(ridge.score(train_scaled, train_target))\n","print(ridge.score(test_scaled, test_target))"]},{"cell_type":"markdown","metadata":{"id":"jUph9pH_KA9_"},"source":["## 라쏘\n","\n","라쏘 모델을 훈련하는 것은 릿지와 매우 비슷하다. 릿지 클래스를 라쏘 클래스로 바꾸는 것이 전부이다."]},{"cell_type":"code","execution_count":42,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"Ymu-jmekh0IK","outputId":"d0bc3b62-0b61-43f8-de8c-76b986d8ceb2","executionInfo":{"status":"ok","timestamp":1679374036229,"user_tz":-540,"elapsed":283,"user":{"displayName":"j j","userId":"09320029400633244007"}}},"outputs":[{"output_type":"stream","name":"stdout","text":["0.989789897208096\n"]}],"source":["from sklearn.linear_model import Lasso\n","\n","lasso = Lasso()\n","lasso.fit(train_scaled, train_target)\n","print(lasso.score(train_scaled, train_target))"]},{"cell_type":"code","execution_count":43,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"t3jO37UMh2iI","outputId":"67ede17f-57b3-4426-9846-0f2471d1217f","executionInfo":{"status":"ok","timestamp":1679374037793,"user_tz":-540,"elapsed":405,"user":{"displayName":"j j","userId":"09320029400633244007"}}},"outputs":[{"output_type":"stream","name":"stdout","text":["0.9800593698421883\n"]}],"source":["print(lasso.score(test_scaled, 
test_target))"]},{"cell_type":"code","execution_count":44,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"uoL2oJ6Ih4Jw","outputId":"a9254b33-8297-459e-b463-64d516c7a172","scrolled":true,"executionInfo":{"status":"ok","timestamp":1679374038598,"user_tz":-540,"elapsed":407,"user":{"displayName":"j j","userId":"09320029400633244007"}}},"outputs":[{"output_type":"stream","name":"stderr","text":["/usr/local/lib/python3.9/dist-packages/sklearn/linear_model/_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.878e+04, tolerance: 5.183e+02\n"," model = cd_fast.enet_coordinate_descent(\n","/usr/local/lib/python3.9/dist-packages/sklearn/linear_model/_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. 
Duality gap: 1.297e+04, tolerance: 5.183e+02\n"," model = cd_fast.enet_coordinate_descent(\n"]}],"source":["train_score = []\n","test_score = []\n","\n","alpha_list = [0.001, 0.01, 0.1, 1, 10, 100]\n","for alpha in alpha_list:\n","    # Create the lasso model; max_iter is raised explicitly, yet the stored run\n","    # still logged ConvergenceWarning twice during this sweep\n","    lasso = Lasso(alpha=alpha, max_iter=10000)\n","    # Train the lasso model\n","    lasso.fit(train_scaled, train_target)\n","    # Store the training and test scores\n","    train_score.append(lasso.score(train_scaled, train_target))\n","    test_score.append(lasso.score(test_scaled, test_target))"]},{"cell_type":"code","execution_count":45,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":279},"id":"7rkH8Dvzh9UI","outputId":"01b6002c-c5fa-4748-be4f-d5c8f350bd0f","executionInfo":{"status":"ok","timestamp":1679374040250,"user_tz":-540,"elapsed":438,"user":{"displayName":"j j","userId":"09320029400633244007"}}},"outputs":[{"output_type":"display_data","data":{"text/plain":["<Figure size 432x288 with 1 Axes>"],"image/png":"\n"},"metadata":{"needs_background":"light"}}],"source":["plt.plot(np.log10(alpha_list), train_score, label='train')\n","plt.plot(np.log10(alpha_list), test_score, label='test')\n","# The x values are log10(alpha), so the axis label must say so\n","plt.xlabel('log10(alpha)')\n","plt.ylabel('R^2')\n","plt.legend()\n","plt.show()"]},{"cell_type":"code","execution_count":46,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"t4uFD9Flh_Dw","outputId":"1e81d589-ae24-471a-b3cc-90c3482efdbd","executionInfo":{"status":"ok","timestamp":1679374040768,"user_tz":-540,"elapsed":6,"user":{"displayName":"j j","userId":"09320029400633244007"}}},"outputs":[{"output_type":"stream","name":"stdout","text":["0.9888067471131867\n","0.9824470598706695\n"]}],"source":["lasso = Lasso(alpha=10)\n","lasso.fit(train_scaled, train_target)\n","\n","print(lasso.score(train_scaled, train_target))\n","print(lasso.score(test_scaled, 
test_target))"]},{"cell_type":"code","execution_count":47,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"z_bQc3s8Uoai","outputId":"760890a4-5de7-407d-f737-4930d7596e83","executionInfo":{"status":"ok","timestamp":1679374042134,"user_tz":-540,"elapsed":248,"user":{"displayName":"j j","userId":"09320029400633244007"}}},"outputs":[{"output_type":"stream","name":"stdout","text":["40\n"]}],"source":["# Count the coefficients that are exactly zero in the fitted lasso model\n","print(np.sum(lasso.coef_ == 0))"]}],"metadata":{"colab":{"provenance":[{"file_id":"https://github.com/rickiepark/hg-mldl/blob/master/3-3.ipynb","timestamp":1679323526285}]},"kernelspec":{"display_name":"default:Python","language":"python","name":"conda-env-default-py"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.9.10"}},"nbformat":4,"nbformat_minor":0}