First import libraries and data
# !pip install seaborn pandas matplotlib numpy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython import display
display.set_matplotlib_formats('svg')
# Alternative to set svg for newer versions
# import matplotlib_inline
# matplotlib_inline.backend_inline.set_matplotlib_formats('svg')
data = pd.read_csv('house_sales.zip')
/Users/mli/miniconda3/envs/py38/lib/python3.8/site-packages/IPython/core/interactiveshell.py:3146: DtypeWarning: Columns (62,91,93,97,98,101,103,106,107,113,134,136,139,140,152,154,155,165,172,175,176,178,187,190,192,193,199,203,204,205,208,209,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,253,254,259,261,262,263,264,265,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,294,295,296,299,301,302,303,304,305,306,307,308,310,311,312,313,314,316,317,318,319,321,322,325,326,327,328,329,330,331,332,333,334,335,336,337,338,339,340,341,342,343,344,345,347,348,349,350,351,352,353,354,355,356,357,358,359,360,361,362,363,364,365,366,367,368,369,370,371,372,373,374,375,376,377,378,379,380,381,382,383,384,385,386,387,388,389,390,391,392,393,394,395,396,397,398,399,400,401,402,403,404,405,406,407,408,409,410,411,412,413,414,415,416,417,418,419,420,421,427,428,429,430,431,432,433,434,435,436,437,438,439,440,441,442,444,445,446,447,448,449,450,451,452,453,454,455,456,457,458,459,460,461,462,463,464,465,470,471,472,473,474,475,476,477,478,479,480,481,482,483,484,485,486,487,488,489,490,491,492,493,494,495,496,497,498,499,500,501,502,503,504,505,506,507,508,509,510,511,512,513,514,515,516,517,518,519,520,521,522,523,524,525,526,527,528,529,530,531,532,533,534,535,536,537,538,539,540,541,542,543,544,545,546,547,548,549,550,551,552,553,554,555,556,557,558,559,560,561,562,563,564,565,566,567,568,569,570,571,572,573,574,576,577,578,579,580,581,582,583,584,585,586,587,588,589,590,591,592,593,594,595,596,597,598,599,600,601,602,603,604,605,606,607,608,609,610,611,612,613,614,615,616,617,618,619,620,621,625,626,627,628,629,630,631,632,633,634,635,636,637,638,639,640,641,642,643,644,645,646,647,648,649,650,651,652,653,654,655,656,657,658,659,660,661,662,663,664,665,666,667,668,669,670,671,672,673,674,675,676,677,678,679,680,681,682,683,684,685,686,687,688,689,702,703,704,705,706,707,708,709,710,711,712,713,714,715,728,730,731,732,733,734,735,736,737,738,739,740,741,742,743,744,745,746,747,748,749,750,751,752,753,754,755,756,757,758,759,760,761,762,763,764,765,766,767,768,769,770,771,772,773,774,775,776,777,778,779,780,781,782,783,784,785,786,787,788,789,790,791,792,793,794,795,796,797,798,799,800,801,802,803,804,805,806,807,808,809,810,811,812,813,814,815,816,817,818,819,820,821,822,823,824,825,826,827,828,829,830,831,832,834,835,836,837,838,839,840,841,842,843,844,845,846,847,848,849,850,851,852,853,854,855,856,857,858,859,860,861,862,863,864,865,867,868,869,870,871,872,874,875,876,877,878,879,880,881,882,883,884,886,887,888,890,891,892,893,894,895,896,897,898,899,900,901,902,903,904,905,906,907,908,909,910,911,912,913,914,915,916,917,918,919,920,921,922,923,924,925,926,944,945,946,947,948,949,950,951,952,953,954,955,956,957,958,960,962,963,964,965,966,967,968,969,970,971,972,973,974,975,976,977,978,979,980,981,982,983,984,985,987,989,990,991,992,993,994,995,996,997,1000,1001,1002,1003,1004,1005,1006,1007,1008,1009,1010,1011,1012,1013,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023,1024,1025,1026,1027,1028,1029,1031,1032,1033,1034,1035,1036,1043,1045,1048,1049,1050,1051,1052,1053,1054,1055,1056,1057,1058,1059,1060,1061,1062,1063,1065,1066,1067,1068,1069,1070,1071,1072,1073,1074,1075,1076,1078,1080,1081,1082,1083,1084,1086,1087,1088,1089,1090,1091,1092,1093,1094,1095,1096,1097,1098,1099,1100,1101,1102,1103,1104,1105,1106,1107,1108,1109,1110,1111,1112,1113,1114,1115,1116,1117,1119,1120,1121,1123,1124,1125,1126,1127,1128,1129,1130,1131,1132,1133,1134,1135,1136,1137,1138,1139,1140,1141,1142,1143,1144,1145,1146,1147,1148,1149,1150,1151,1152,1153,1154,1155,1156,1157,1158,1159,1160,1161,1162,1163,1164,1165,1166,1167,1168,1169,1170,1171,1172,1173,1174,1175,1176,1177,1178,1179,1180,1181,1182,1183,1184,1185,1186,1187,1188,1189,1190,1191,1192,1193,1194,1196,1197,1198,1199,1200,1201,1202,1203,1204,1205,1206,1207,1208,1209,1210,1211,1212,1213,1214,1215,1216,1217,1218,1219,1220,1221,1222,1223,1224,1225,1226,1227,1230,1231,1232,1233,1234,1235,1236,1237,1238,1239,1240,1241,1242,1243,1244,1246,1247,1248,1249,1251,1252,1253,1254,1255,1256,1257,1259,1260,1261,1262,1263,1264,1265,1266,1269,1270,1273,1274,1275,1276,1277,1278,1279,1280,1281,1288,1289,1290,1291,1292,1293,1294,1295,1296,1297,1298,1299,1300,1301,1303,1304,1305,1306,1307,1308,1309,1310,1311,1312,1313,1314,1315,1316,1317,1318,1320,1321,1322,1323,1324,1325,1326,1327,1328,1329,1330,1331,1332,1333,1334,1335,1336,1341,1342,1343,1344,1345,1346,1350,1351,1352,1353,1354,1355,1356,1357,1358,1359,1360,1361,1362,1363,1364,1365,1366,1367,1368,1369,1370,1371,1372,1374,1375,1376,1377,1378,1379,1380,1381,1382,1383,1384,1385,1386,1387,1388,1389,1390,1391,1392,1393,1394,1395,1396,1397,1398,1399,1400,1401,1402,1403,1404,1405,1406,1407,1408,1409,1410,1411,1412,1413,1414,1415,1417,1418,1419,1420,1421,1422,1423,1424,1425,1426,1427,1428,1429,1430,1431,1432,1433,1434,1435,1437,1438,1439,1440,1441,1442,1443,1444,1445,1446,1447,1448,1449,1450,1451,1452,1453,1454,1455,1456,1457,1458,1459,1460,1461,1462,1463,1464,1465,1466,1467,1468,1469,1470,1471,1472,1473,1474,1475,1476,1477,1478,1479,1480,1481,1482,1483,1484,1485,1486,1487,1488,1489,1490,1491,1492,1493,1494,1495,1496,1497,1498,1499,1500,1501,1502,1503,1504,1505,1506,1507,1508,1509,1510,1514,1516,1517,1519,1520,1521,1522,1523,1524,1525,1526,1527,1528,1530,1531,1532,1533,1534,1535,1538,1540,1541,1542,1543,1544,1545,1546,1547,1548,1549,1550,1551,1552,1553,1554,1555,1556,1557,1558,1559,1560,1561,1562,1563,1564,1565,1566,1567,1568,1569,1570,1571,1572,1573,1574,1575,1576,1577,1578,1579,1580,1581,1582,1583,1584,1585,1586,1587,1588,1590,1592,1593,1594,1595,1596,1597,1598,1599,1600,1601,1602,1603,1604,1605,1606,1607,1608,1610,1611,1612,1613,1614,1615,1616,1617,1618,1619,1620,1621,1622,1623,1624,1625,1626,1627,1628,1629,1630,1631,1633,1634,1635,1636,1637,1638,1639,1640,1641,1642,1643,1644,1645,1646,1647,1648,1649,1650,1651,1652,1653,1654,1658,1659,1660,1661,1663,1664,1665,1666,1667,1668,1669,1670,1671,1672,1673,1674,1675,1677,1679,1680,1681,1682,1684,1685,1686,1687,1688,1689,1690,1691,1692,1693,1694,1695,1696,1697,1698,1699,1700,1701,1702,1703,1705,1706,1707,1708,1709,1710,1711,1712,1713,1714,1716,1717,1718,1719,1720,1721,1722,1723,1724,1725,1726,1727,1728,1729,1730,1731,1732,1733,1734,1735,1736,1737,1738,1739,1740,1741,1742,1743,1744,1745,1746,1747,1748,1749,1750,1751,1752,1753,1754,1755,1757,1758,1759,1760,1761,1762,1763,1764,1765,1766,1767,1768,1769,1770,1771,1772,1773,1774,1775,1776,1777,1778,1779,1781,1782,1783,1786,1787) have mixed types.Specify dtype option on import or set low_memory=False. has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
Let's check the data shape and the first a few examples
data.shape
(164944, 1789)
data.head()
Id | Address | Sold Price | Sold On | Summary | Type | Year built | Heating | Cooling | Parking | ... | Well Disclosure | remodeled | DOH2 | SerialX | Full Baths | Tax Legal Lot Number | Tax Legal Block Number | Tax Legal Tract Number | Building Name | Zip | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2080183300 | 11205 Monterey, | $2,000,000 | 01/31/20 | 11205 Monterey, San Martin, CA 95046 is a sing... | SingleFamily | No Data | No Data | No Data | 0 spaces | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 95046 |
1 | 20926300 | 5281 Castle Rd, | $2,100,000 | 02/25/21 | Spectacular Mountain and incredible L.A. City ... | SingleFamily | 1951 | Central | Central Air, Dual | Driveway, Driveway - Brick | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 91011 |
2 | 19595300 | 3581 Butcher Dr, | $1,125,000 | 11/06/19 | Eichler Style home! with Santa Clara High! in ... | SingleFamily | 1954 | Central Forced Air - Gas | Central AC | Garage, Garage - Attached, Covered | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 95051 |
3 | 300472200 | 2021 N Milpitas Blvd, | $36,250,000 | 10/02/20 | 2021 N Milpitas Blvd, Milpitas, CA 95035 is a ... | Apartment | 1989 | Other | No Data | Mixed, Covered | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 95035 |
4 | 2074492000 | LOT 4 Tool Box Spring Rd, | $140,000 | 10/19/20 | Beautiful level lot dotted with pine trees ro... | VacantLand | No Data | No Data | No Data | 0 spaces | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 92561 |
5 rows × 1789 columns
We drop columns that at least 30% values are null to simplify our EDA.
null_sum = data.isnull().sum()
data.columns[null_sum < len(data) * 0.3] # columns will keep
Index(['Id', 'Address', 'Sold Price', 'Sold On', 'Summary', 'Type', 'Year built', 'Heating', 'Cooling', 'Parking', 'Bedrooms', 'Bathrooms', 'Total interior livable area', 'Total spaces', 'Garage spaces', 'Home type', 'Region', 'Elementary School', 'Elementary School Score', 'Elementary School Distance', 'High School', 'High School Score', 'High School Distance', 'Heating features', 'Parking features', 'Lot size', 'Parcel number', 'Tax assessed value', 'Annual tax amount', 'Listed On', 'Listed Price', 'Zip'], dtype='object')
data.drop(columns=data.columns[null_sum > len(data) * 0.3], inplace=True)
Next we check the data types
data.dtypes
Id int64 Address object Sold Price object Sold On object Summary object Type object Year built object Heating object Cooling object Parking object Bedrooms object Bathrooms float64 Total interior livable area object Total spaces float64 Garage spaces float64 Home type object Region object Elementary School object Elementary School Score float64 Elementary School Distance float64 High School object High School Score float64 High School Distance float64 Heating features object Parking features object Lot size object Parcel number object Tax assessed value object Annual tax amount object Listed On object Listed Price object Zip int64 dtype: object
Convert currency from string format such as $1,000,000
to float.
currency = ['Sold Price', 'Listed Price', 'Tax assessed value', 'Annual tax amount']
for c in currency:
data[c] = data[c].replace(
r'[$,-]', '', regex=True).replace(
r'^\s*$', np.nan, regex=True).astype(float)
Also convert areas from string format such as 1000 sqft
and 1 Acres
to float as well.
areas = ['Total interior livable area', 'Lot size']
for c in areas:
acres = data[c].str.contains('Acres') == True
col = data[c].replace(r'\b sqft\b|\b Acres\b|\b,\b','', regex=True).astype(float)
col[acres] *= 43560
data[c] = col
Now we can check values of the numerical columns. You could see the min and max values for several columns do not make sense.
data.describe()
Id | Sold Price | Bathrooms | Total interior livable area | Total spaces | Garage spaces | Elementary School Score | Elementary School Distance | High School Score | High School Distance | Lot size | Tax assessed value | Annual tax amount | Listed Price | Zip | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 1.649440e+05 | 1.648590e+05 | 141791.000000 | 1.465450e+05 | 156738.000000 | 156736.000000 | 145676.000000 | 146288.000000 | 144511.000000 | 145451.000000 | 1.358450e+05 | 1.450650e+05 | 1.433500e+05 | 1.250060e+05 | 164944.000000 |
mean | 2.791434e+08 | 1.194842e+06 | 2.303087 | 3.182221e+03 | 1.706044 | 1.607614 | 5.654892 | 1.260918 | 6.086485 | 2.573214 | 9.525061e+05 | 8.898781e+05 | 1.123415e+04 | 1.197671e+06 | 93084.811172 |
std | 6.424318e+08 | 3.336365e+06 | 1.646634 | 4.609881e+05 | 28.802242 | 28.782370 | 2.098547 | 2.888909 | 2.033379 | 4.067624 | 1.357197e+08 | 3.126888e+06 | 3.859389e+04 | 2.874721e+06 | 2265.021138 |
min | 7.387732e+06 | 1.000000e+00 | 0.000000 | 1.000000e+00 | -26.000000 | -26.000000 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 1.000000e+00 | 85611.000000 |
25% | 1.913563e+07 | 4.350000e+05 | 2.000000 | 1.170000e+03 | 0.000000 | 0.000000 | 4.000000 | 0.300000 | 5.000000 | 0.800000 | 4.800000e+03 | 2.550000e+05 | 3.434250e+03 | 4.990000e+05 | 90232.000000 |
50% | 2.059865e+07 | 8.050000e+05 | 2.000000 | 1.558000e+03 | 1.000000 | 1.000000 | 6.000000 | 0.500000 | 6.000000 | 1.400000 | 6.603000e+03 | 5.635010e+05 | 7.372000e+03 | 8.490000e+05 | 94066.000000 |
75% | 8.923942e+07 | 1.370000e+06 | 3.000000 | 2.144000e+03 | 2.000000 | 2.000000 | 7.000000 | 1.000000 | 8.000000 | 2.500000 | 1.209000e+04 | 1.033832e+06 | 1.321300e+04 | 1.395000e+06 | 95053.000000 |
max | 2.147000e+09 | 8.660000e+08 | 256.000000 | 1.764164e+08 | 9999.000000 | 9999.000000 | 10.000000 | 76.400000 | 10.000000 | 77.800000 | 4.856770e+10 | 8.256328e+08 | 9.977342e+06 | 6.250000e+08 | 96155.000000 |
We filter out houses whose living areas are too small or too hard to simplify the visualization later.
abnormal = (data[areas[1]] < 10) | (data[areas[1]] > 1e4)
data = data[~abnormal]
sum(abnormal)
41000
Let's check the histogram of the 'Sold Price'
, which is the target we want to predict.
ax = sns.histplot(np.log10(data['Sold Price']))
ax.set_xlim([3, 8])
ax.set_xticks(range(3, 9))
ax.set_xticklabels(['%.0e'%a for a in 10**ax.get_xticks()]);
A house has different types. Here are the top types:
data['Type'].value_counts()[0:20]
SingleFamily 74318 Condo 18749 MultiFamily 6586 VacantLand 6199 Townhouse 5846 Unknown 5390 MobileManufactured 2588 Apartment 1416 Cooperative 161 Residential Lot 75 Single Family 69 Single Family Lot 56 Acreage 48 2 Story 39 3 Story 25 Hi-Rise (9+), Luxury 21 RESIDENTIAL 19 Condominium 19 Duplex 19 Mid-Rise (4-8) 17 Name: Type, dtype: int64
Price density for different house types.
types = data['Type'].isin(['SingleFamily', 'Condo', 'MultiFamily', 'Townhouse'])
sns.displot(pd.DataFrame({'Sold Price':np.log10(data[types]['Sold Price']),
'Type':data[types]['Type']}),
x='Sold Price', hue='Type', kind='kde');
Another important measurement is the sale price per living sqft. Let's check the differences between different house types.
data['Price per living sqft'] = data['Sold Price'] / data['Total interior livable area']
ax = sns.boxplot(x='Type', y='Price per living sqft', data=data[types], fliersize=0)
ax.set_ylim([0, 2000]);
We know the location affect the price. Let's check the price for the top 20 zip codes.
d = data[data['Zip'].isin(data['Zip'].value_counts()[:20].keys())]
ax = sns.boxplot(x='Zip', y='Price per living sqft', data=d, fliersize=0)
ax.set_ylim([0, 2000])
ax.set_xticklabels(ax.get_xticklabels(), rotation=90);
Last, we visualize the correlation matrix of several columns.
_, ax = plt.subplots(figsize=(6,6))
columns = ['Sold Price', 'Listed Price', 'Annual tax amount', 'Price per living sqft', 'Elementary School Score', 'High School Score']
sns.heatmap(data[columns].corr(),annot=True,cmap='RdYlGn', ax=ax);
This notebook demonstrates the basic technologies for EDA, including
We only explored a small aspect of the data. You are welcome to dive deep into more details.