In [1]:
from IPython.display import display
from IPython.display import HTML
import IPython.core.display as di # Example: di.display_html('<h3>%s:</h3>' % str, raw=True)

# This line will hide code by default when the notebook is exported as HTML
di.display_html('<script>jQuery(function() {if (jQuery("body.notebook_app").length == 0) { jQuery(".input_area").toggle(); jQuery(".prompt").toggle();}});</script>', raw=True)

# This line will add a button to toggle visibility of code blocks, for use with the HTML export version
di.display_html('''<button onclick="jQuery('.input_area').toggle(); jQuery('.prompt').toggle();">Toggle code</button>''', raw=True)

An Exploratory Data Analysis of a real Airbnb listings dataset

This public dataset is taken from https://www.kaggle.com/dgomonov/new-york-city-airbnb-open-data.
It covers the Airbnb listings in New York City in 2019. We will perform an exploratory data analysis (EDA) on this dataset. Let's begin.
If you want, you can reveal the entire Python code by clicking the "Toggle code" button.

In [2]:
from IPython.core.display import display,HTML
display(HTML('<style>.prompt{width: 0px; min-width: 0px; visibility: collapse}</style>'))
In [3]:
%%javascript 
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}
In [4]:
import numpy as np, pandas as pd, matplotlib.pyplot as plt
plt.style.use('bmh')
import warnings
warnings.filterwarnings('ignore')

pd.options.display.float_format = '{:.6f}'.format
import seaborn as sns
import matplotlib
matplotlib.rcParams.update({'font.size': 12, 'font.family': 'Verdana'})
%matplotlib inline
In [5]:
df = pd.read_csv('AB_NYC_2019.csv')

After importing the dataset, we inspect the first 10 records to get acquainted with its structure.

In [6]:
df.head(10)
Out[6]:
id name host_id host_name neighbourhood_group neighbourhood latitude longitude room_type price minimum_nights number_of_reviews last_review reviews_per_month calculated_host_listings_count availability_365
0 2539 Clean & quiet apt home by the park 2787 John Brooklyn Kensington 40.647490 -73.972370 Private room 149 1 9 2018-10-19 0.210000 6 365
1 2595 Skylit Midtown Castle 2845 Jennifer Manhattan Midtown 40.753620 -73.983770 Entire home/apt 225 1 45 2019-05-21 0.380000 2 355
2 3647 THE VILLAGE OF HARLEM....NEW YORK ! 4632 Elisabeth Manhattan Harlem 40.809020 -73.941900 Private room 150 3 0 NaN nan 1 365
3 3831 Cozy Entire Floor of Brownstone 4869 LisaRoxanne Brooklyn Clinton Hill 40.685140 -73.959760 Entire home/apt 89 1 270 2019-07-05 4.640000 1 194
4 5022 Entire Apt: Spacious Studio/Loft by central park 7192 Laura Manhattan East Harlem 40.798510 -73.943990 Entire home/apt 80 10 9 2018-11-19 0.100000 1 0
5 5099 Large Cozy 1 BR Apartment In Midtown East 7322 Chris Manhattan Murray Hill 40.747670 -73.975000 Entire home/apt 200 3 74 2019-06-22 0.590000 1 129
6 5121 BlissArtsSpace! 7356 Garon Brooklyn Bedford-Stuyvesant 40.686880 -73.955960 Private room 60 45 49 2017-10-05 0.400000 1 0
7 5178 Large Furnished Room Near B'way 8967 Shunichi Manhattan Hell's Kitchen 40.764890 -73.984930 Private room 79 2 430 2019-06-24 3.470000 1 220
8 5203 Cozy Clean Guest Room - Family Apt 7490 MaryEllen Manhattan Upper West Side 40.801780 -73.967230 Private room 79 2 118 2017-07-21 0.990000 1 0
9 5238 Cute & Cozy Lower East Side 1 bdrm 7549 Ben Manhattan Chinatown 40.713440 -73.990370 Entire home/apt 150 1 160 2019-06-09 1.330000 4 188
In [7]:
print('Number of records -->', df.shape[0])
print('Number of features -->', df.shape[1])
Number of records --> 48895
Number of features --> 16
In [8]:
print('Features type:')
df.dtypes
Features type:
Out[8]:
id                                  int64
name                               object
host_id                             int64
host_name                          object
neighbourhood_group                object
neighbourhood                      object
latitude                          float64
longitude                         float64
room_type                          object
price                               int64
minimum_nights                      int64
number_of_reviews                   int64
last_review                        object
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
dtype: object
In [9]:
print('Number of null values for each feature:')
df.isnull().sum()
Number of null values for each feature:
Out[9]:
id                                    0
name                                 16
host_id                               0
host_name                            21
neighbourhood_group                   0
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
last_review                       10052
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64

A recap of the dataset structure:

In [10]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 16 columns):
id                                48895 non-null int64
name                              48879 non-null object
host_id                           48895 non-null int64
host_name                         48874 non-null object
neighbourhood_group               48895 non-null object
neighbourhood                     48895 non-null object
latitude                          48895 non-null float64
longitude                         48895 non-null float64
room_type                         48895 non-null object
price                             48895 non-null int64
minimum_nights                    48895 non-null int64
number_of_reviews                 48895 non-null int64
last_review                       38843 non-null object
reviews_per_month                 38843 non-null float64
calculated_host_listings_count    48895 non-null int64
availability_365                  48895 non-null int64
dtypes: float64(3), int64(7), object(6)
memory usage: 6.0+ MB

In this analysis we will use only a subset of the features. For example, we don't care about the host's name, because we are not going to study how listings are distributed between male and female hosts (I'm not saying it's wrong to keep that feature, just that it's useless for our aims), so we drop the host_name feature. Likewise, we don't care about how the listings are titled (it might be interesting to analyze this feature too, but I guess it is less important than other aspects of the dataset), so we also drop the name feature.
We also drop the features with too many null values. In the end, the features we are interested in examining are listed below, with a quick sanity check of the uniqueness claims right after the list:

id ---> identifies the listing, so id values are unique
host_id ---> identifies the host: a host id can be associated with more than one listing, so this feature's values are not unique in the dataset
neighbourhood_group ---> for geolocation analysis
neighbourhood ---> for a more precise geolocation analysis
latitude ---> for the exact geolocation
longitude ---> for the exact geolocation
room_type ---> different room types very likely have different daily price distributions, so it's important to distinguish them
price ---> price for one night
minimum_nights ---> minimum number of nights for a single booking
number_of_reviews ---> number of reviews for a listing
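
A quick sanity check of the uniqueness claims above (a minimal sketch, assuming df is the DataFrame loaded earlier):

print('id unique?     ', df.id.is_unique)       # expected True: one row per listing
print('host_id unique?', df.host_id.is_unique)  # expected False: a host can own several listings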

Our purpose is to understand how Airbnb prices vary across New York, how they differ between room types, and which areas of the city have the largest number of listings.

In [11]:
df1 = df[['id', 'host_id', 'neighbourhood_group', 'neighbourhood', \
          'latitude', 'longitude', 'room_type','price','minimum_nights','number_of_reviews']]

The first 10 records of the dataset used for our purposes:

In [41]:
%store df1
Stored 'df1' (DataFrame)
In [12]:
df1.head(10)
Out[12]:
id host_id neighbourhood_group neighbourhood latitude longitude room_type price minimum_nights number_of_reviews
0 2539 2787 Brooklyn Kensington 40.647490 -73.972370 Private room 149 1 9
1 2595 2845 Manhattan Midtown 40.753620 -73.983770 Entire home/apt 225 1 45
2 3647 4632 Manhattan Harlem 40.809020 -73.941900 Private room 150 3 0
3 3831 4869 Brooklyn Clinton Hill 40.685140 -73.959760 Entire home/apt 89 1 270
4 5022 7192 Manhattan East Harlem 40.798510 -73.943990 Entire home/apt 80 10 9
5 5099 7322 Manhattan Murray Hill 40.747670 -73.975000 Entire home/apt 200 3 74
6 5121 7356 Brooklyn Bedford-Stuyvesant 40.686880 -73.955960 Private room 60 45 49
7 5178 8967 Manhattan Hell's Kitchen 40.764890 -73.984930 Private room 79 2 430
8 5203 7490 Manhattan Upper West Side 40.801780 -73.967230 Private room 79 2 118
9 5238 7549 Manhattan Chinatown 40.713440 -73.990370 Entire home/apt 150 1 160

The trimmed dataset has 48895 records and 10 features.

In [13]:
df1.shape
Out[13]:
(48895, 10)

What are the unique neighbourhood groups and the unique room types included in the dataset?

In [14]:
print('Neighbourhood group:', pd.unique(df1.neighbourhood_group), '\n', 'Room type:',pd.unique(df1.room_type))
Neighbourhood group: ['Brooklyn' 'Manhattan' 'Queens' 'Staten Island' 'Bronx'] 
 Room type: ['Private room' 'Entire home/apt' 'Shared room']

Let's take an image of New York from Google:

In [15]:
from IPython.display import Image
Image(filename='NY.jpg') 
Out[15]:
In [16]:
import seaborn as sns

Let's plot a scatterplot of the listings in our dataset:

In [17]:
fig, ax = plt.subplots(figsize = (10,10))
sns.scatterplot(x='longitude', y='latitude', hue='neighbourhood_group', ax = ax, s = 20, alpha = 0.2, data=df1);
plt.title('Scatterplot evidencing Airbnb listing density in New York');

The areas where the points pile up most densely are the areas with the largest number of Airbnb listings.


Below, the plot on the left confirms these different listing densities across New York, while the plot on the right distinguishes between the different room types.

In [18]:
groupedbyZone = df1.groupby('neighbourhood_group')
fig, ax = plt.subplots(1,2, figsize = (14,6))
sns.countplot(df1['neighbourhood_group'], ax = ax[0], linewidth=1, edgecolor='w')
sns.countplot(df1['neighbourhood_group'], hue = df1['room_type'], ax = ax[1], linewidth=1, edgecolor='w')
ax[0].set_xlabel('Borough', labelpad = 10);
ax[1].set_xlabel('Borough', labelpad = 10);
ax[0].set_ylabel('Listings number');
ax[1].set_ylabel('Listings number');
plt.tight_layout();



On Airbnb the host can specify a minimum number of nights for a listing.
What is the average minimum number of nights per listing for the various boroughs and room types? You can see it below.

In [19]:
sns.catplot('neighbourhood_group', 'minimum_nights', hue = 'room_type', data = df1, 
            kind = 'bar', ci = None, linewidth=1, edgecolor='w', height=8.27, aspect=11.7/8.27)
plt.xlabel('Borough', fontsize = 15, labelpad = 15)
plt.xticks(fontsize = 13)
plt.ylabel('Average minimum nights per listing',fontsize = 17, labelpad = 14);

And what is the average number of reviews per listing for the various boroughs and room types?

In [20]:
sns.catplot('neighbourhood_group', y = 'number_of_reviews', hue = 'room_type',  kind = 'bar', 
            ci = None, data = df1, linewidth=1, edgecolor='w', height=8.27, aspect=11.7/8.27)
plt.xlabel('Borough', fontsize = 15, labelpad = 15)
plt.xticks(fontsize = 13)
plt.ylabel('Average number of reviews per listing', fontsize = 17, labelpad = 14);



Now let's check out the principal price statistics of these Airbnb listings across all of New York.

In [21]:
print(df1.price.describe(), '\n')
print('--> 98th Price percentile:',np.percentile(df1.price, 98), '$')
count   48895.000000
mean      152.720687
std       240.154170
min         0.000000
25%        69.000000
50%       106.000000
75%       175.000000
max     10000.000000
Name: price, dtype: float64 

--> 98th Price percentile: 550.0 $

The maximum price is $10000, a clear outlier given that the 98th percentile is $550.
However, outlier analysis is a broad topic and I will dive into it in the second part of this study.
For now we'll study the price distribution truncated at the 98th percentile.
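
As a quick illustration of why the median is a safer location estimate than the mean here (a minimal sketch, assuming df1 from the cells above):

# compare location estimates with and without the top 2% of prices
top2pct = df1.price.quantile(0.98)   # 550.0, matching np.percentile above
kept = df1.price[df1.price <= top2pct]
print('mean   (all / truncated):', round(df1.price.mean(), 1), '/', round(kept.mean(), 1))
print('median (all / truncated):', df1.price.median(), '/', kept.median())
# the mean drops noticeably once the extreme prices are removed; the median barely moves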

Let's calculate the 98th percentile for each borough.

In [22]:
price98thPerc = pd.pivot_table(df1, values = ['price'], index = ['neighbourhood_group'], \
                                aggfunc = lambda x: int(np.percentile(x, 98)))
price98thPerc.rename(columns = {'price' : '98th price percentile'}, inplace = True)
#price98thPerc.iloc[:,0] = price98thPerc.iloc[:,0].map('$ {}'.format)
price98thPerc
Out[22]:
98th price percentile
neighbourhood_group
Bronx 312
Brooklyn 400
Manhattan 714
Queens 300
Staten Island 372

Let's continue our analysis by deleting the outlier prices: for each borough we exclude the prices greater than the corresponding 98th percentile.
Indeed, in this exploratory analysis the outliers are not our subject of study, and removing them doesn't invalidate the logic of our analysis.

In [23]:
df1_merged = pd.merge(df1, price98thPerc, left_on = 'neighbourhood_group', right_index = True)
df1_noPriceOutliers = df1_merged[df1_merged['price'] < df1_merged['98th price percentile']]

numberOutliers = df1.shape[0] - df1_noPriceOutliers.shape[0]
print('In all New York there are {} listings with extreme prices.'.format(numberOutliers))
print('But they represent only {} % of total listings.'.format(round(numberOutliers * 100 / df1.shape[0], 1)))
In all New York there are 1053 listings with extreme prices.
But they represent only 2.2 % of total listings.

Now we can plot a scatterplot showing the different listing prices in New York without the extreme prices. This way the scatterplot is more informative about how prices differ across the various parts of the city. You can see it below.

In [24]:
import urllib.request
# initializing the figure size
plt.figure(figsize=(14,10))
# loading a PNG map of NYC neighbourhoods from Wikimedia Commons
i = urllib.request.urlopen('https://upload.wikimedia.org/wikipedia/commons/e/ec/Neighbourhoods_New_York_City_Map.PNG')
nyc_img = plt.imread(i)
# scaling the image to the latitude and longitude extremes so the scatter overlays correctly
plt.imshow(nyc_img, zorder=0, extent=[-74.258, -73.7, 40.49, 40.92])
ax=plt.gca()
#using scatterplot again
df1_noPriceOutliers.plot(kind='scatter', x='longitude', y='latitude', c='price', ax=ax, 
           cmap=plt.get_cmap('RdBu'), colorbar=True, alpha=0.7);

New York has 5 boroughs, but how many neighbourhoods are present in this dataset?

In [25]:
print('There are {} neighbourhoods present in this dataset.'.format(len(df1.neighbourhood.unique())))
There are 221 neighbourhoods present in this dataset.

What are the ten neighbourhoods with the most Airbnb listings?

In [26]:
intop10  = df1[df1.neighbourhood.isin(list(df1.neighbourhood.value_counts(ascending = False).head(10).index))]
topten = intop10.neighbourhood.value_counts(ascending = False).to_frame()
topten.rename(columns = {'neighbourhood': 'number of listings'}, inplace = True)
topten
Out[26]:
number of listings
Williamsburg 3920
Bedford-Stuyvesant 3714
Harlem 2658
Bushwick 2465
Upper West Side 1971
Hell's Kitchen 1958
East Village 1853
Upper East Side 1798
Crown Heights 1564
Midtown 1545
In [27]:
print('Fraction and Cumulative fraction of top 10 neighbourhood over total listings:')
neighweight = pd.DataFrame([intop10.neighbourhood.value_counts()*100 / df.neighbourhood.value_counts().sum(), 
             np.cumsum(intop10.neighbourhood.value_counts()*100 / df.neighbourhood.value_counts().sum())],\
                index = ['% over total listings in New York','cumulative % over total listing in New York'])
neighweight = neighweight.T
#neighweight.rename(columns = {neighweight.columns[0]:'% over total listings', neighweight.columns[1]: 'cumulative %'})
neighweight.name = 'Top 10 Neighbourhood'
neighweight = neighweight.applymap('{:.1f}'.format)
neighweight
Fraction and Cumulative fraction of top 10 neighbourhood over total listings:
Out[27]:
% over total listings in New York cumulative % over total listing in New York
Williamsburg 8.0 8.0
Bedford-Stuyvesant 7.6 15.6
Harlem 5.4 21.0
Bushwick 5.0 26.1
Upper West Side 4.0 30.1
Hell's Kitchen 4.0 34.1
East Village 3.8 37.9
Upper East Side 3.7 41.6
Crown Heights 3.2 44.8
Midtown 3.2 48.0

So, together, these 10 neighbourhoods cover almost 50% of all New York Airbnb listings.
Where are they located? Below, the top 10 most listing-populated neighbourhoods are plotted in blue.

In [28]:
fig, ax = plt.subplots(figsize = (10,10))
sns.scatterplot(x='longitude', y='latitude', hue='neighbourhood_group', ax = ax, s=20, alpha = 0.4, data=df1);
sns.scatterplot(x='longitude', y='latitude', ax = ax, s=20, \
                color = 'b', label = 'Top 10 neighbourhood for Airbnb listings', alpha = 0.8, data = intop10);
ax.legend(fancybox=True, framealpha=1, shadow=True, borderpad=1);
ax.set_title('Airbnb listings density in New York\n Top 10 dense neighbourhood in Blue');

Where is Williamsburg, the most listing-populated neighbourhood, located? Let's see...

In [29]:
fig, ax = plt.subplots(figsize = (10,10))

sns.scatterplot(x='longitude', y='latitude', hue='neighbourhood_group', ax = ax, s=20, alpha = 0.4, data=df1);
sns.scatterplot(x='longitude', y='latitude', ax = ax, s=20, color = 'b', \
                alpha = 0.8, data = intop10[intop10.neighbourhood == 'Williamsburg'], label = 'Williamsburg');
ax.legend(fancybox=True, framealpha=1, shadow=True, borderpad=1);
ax.set_title('Airbnb listings density in New York\n Williamsburg is the most dense neighbourhood');

OK, let's now start the second part of this analysis, where we are going to focus on prices.
Check out the distribution of listing prices across all of New York (up to the 98th percentile).

In [30]:
plt.figure(figsize = (8,8))
df1.price.plot(kind = 'hist', bins = 700, linewidth=1, edgecolor='white')
plt.axis([0,600,0,7000]);
plt.xlabel('Price', labelpad = 10);
plt.ylabel('Number of listings per bin-price', labelpad = 15);
plt.title('Airbnb daily price distribution in New York without outliers', pad = 15);

Let's make a brief digression: this graph is not very enlightening because it doesn't distinguish between the different boroughs and the different room types.
Nevertheless it tells us one important thing: even without the outliers, the empirical price distribution of New York Airbnb listings is right-skewed.
So, as a measure of central tendency I prefer the median (more robust), not the mean.
Likewise, as a measure of dispersion I prefer the interquartile range (IQR), not the standard deviation (the standard deviation is built around the mean, so if I don't use the mean as the location metric I should avoid the standard deviation too).
It might be better to transform the price distribution into a log-price distribution: the log transformation is appropriate when the empirical distribution cannot take negative values and is right-skewed. Moreover, if the transformed distribution is very close to a normal distribution (which can be verified with a QQ-plot), then the prices follow a log-normal distribution.
But the point is: when we read an Airbnb listing, do we reason in prices or in log-prices? In prices, of course, so for the moment we keep the original price distribution. And do we really need to know which theoretical distribution (one may not even exist) is closest to the empirical price distribution? For the purposes of this analysis I don't think so.
Indeed, an exploratory data analysis (EDA) with graphical means and tables, even without precise associations with rigid theoretical models or distributions, can be sufficient (and clearer) for certain goals.
Sometimes we may need to model in a more formal way; sometimes EDA is enough to grasp what we want to know.
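
Before moving on, here is a minimal sketch of the log-price check described above (assuming df1_noPriceOutliers from the earlier cells; scipy is used again later for the IQR):

from scipy import stats

# drop the few $0 listings before taking logs (the min price in describe() above is 0)
logprice = np.log(df1_noPriceOutliers.price[df1_noPriceOutliers.price > 0])
print('skewness of prices:    ', round(stats.skew(df1_noPriceOutliers.price), 2))
print('skewness of log-prices:', round(stats.skew(logprice), 2))

fig, ax = plt.subplots(1, 2, figsize=(14, 6))
logprice.plot(kind='hist', bins=40, ax=ax[0], linewidth=1, edgecolor='white')
ax[0].set_title('Log-price distribution')
# QQ-plot: points near a straight line suggest the prices are roughly log-normal
stats.probplot(logprice, dist='norm', plot=ax[1])
ax[1].set_title('QQ-plot of log-prices vs normal');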

That said, let's plot the empirical price distribution distinguishing by borough:

In [31]:
g = df1_noPriceOutliers.groupby(['neighbourhood_group'])
fig, ax = plt.subplots(figsize = (16,11))
for k, group in g:
    group.price.hist(density = False, ax = ax, bins = 40, label = k, alpha = 0.5, linewidth=1, edgecolor='white')
ax.legend()
ax.set_title('Price histogram by borough', fontsize = 16, pad = 18);
ax.set_xlabel('Price', fontsize = 15, labelpad = 12)
ax.set_ylabel('Frequency in absolute value', fontsize = 15, labelpad = 12);

As you can see in this non-normalized histogram, in Manhattan and Brooklyn not only is the price distribution more right-skewed (listings are more expensive), but there are also more listings, as you already know from the first part of this analysis.
Because of this, it's better to zoom the histogram in on the Bronx and Staten Island boroughs:

In [32]:
fig, ax = plt.subplots(figsize = (14,10))
for k, group in g:
    if k in ['Bronx', 'Staten Island']:
        group.price.hist(density = False, ax = ax, bins = 40, label = k, alpha = 0.5, linewidth=1, edgecolor='white')
ax.legend()
ax.set_title('Price Histogram for Bronx and Staten Island', fontsize = 16, pad = 18);
ax.set_xlabel('Price', fontsize = 15, labelpad = 12)
ax.set_ylabel('Frequency in absolute value', fontsize = 15, labelpad = 12);

OK, now we're going to differentiate the price distribution not only by borough but also by room type.

In [33]:
colors = ['red','tan','blue','green','lime']
fig, ax = plt.subplots(3, 1, figsize = (18,18))
doublegrouped = df1_noPriceOutliers.groupby(['room_type','neighbourhood_group'])
for i, (name, combo) in enumerate(doublegrouped):
    if i <= 4:
        combo.price.plot(kind = 'hist', ax = ax[0], bins = 40, 
                         label = name, alpha = 0.5, linewidth=1, edgecolor='white');
        ax[0].legend()
        ax[0].set_title('Entire home / apt')
    elif 5 <= i <= 9:
        combo.price.plot(kind = 'hist', ax = ax[1], bins = 40, label = name, alpha = 0.5, linewidth=1, edgecolor='white');
        ax[1].legend()
        ax[1].set_title('Private room')
    else:
        combo.price.plot(kind = 'hist', ax = ax[2], bins = 40, label = name, alpha = 0.5, linewidth=1, edgecolor='white');
        ax[2].legend()
        ax[2].set_title('Shared room')
for i in range(3):
    ax[i].set_ylabel('Frequency in absolute value', fontsize = 15, labelpad = 14)
plt.suptitle('Price histograms by room type and borough', fontsize = 20);

Let's zoom in below on the Bronx and Staten Island boroughs:

In [34]:
fig, ax = plt.subplots(3, 1, figsize = (16,16))
doublegrouped = df1_noPriceOutliers.groupby(['room_type','neighbourhood_group'])
for i, (name, combo) in enumerate(doublegrouped):
    if i <= 4 and name[1] in ['Bronx', 'Staten Island']:
        combo.price.plot(kind = 'hist', ax = ax[0], bins = 40, 
                         label = name, alpha = 0.5, linewidth=1, edgecolor='white');
        ax[0].legend()
        ax[0].set_title('Entire home / apt')
    elif 5 <= i <= 9 and name[1] in ['Bronx', 'Staten Island']:
        combo.price.plot(kind = 'hist', ax = ax[1], bins = 40, label = name, alpha = 0.5, linewidth=1, edgecolor='white');
        ax[1].legend()
        ax[1].set_title('Private room')
    elif i > 9 and name[1] in ['Bronx', 'Staten Island']:
        combo.price.plot(kind = 'hist', ax = ax[2], bins = 40, label = name, alpha = 0.5, linewidth=1, edgecolor='white');
        ax[2].legend()
        ax[2].set_title('Shared room')
for k in ax:
    k.set_ylabel('Listings number by bin-price', labelpad = 12)
plt.suptitle('Price histogram by room type for Bronx and Staten Island', fontsize = 15);

I think a recap of the last histograms in a pivot table is useful:

In [35]:
from scipy.stats import iqr
In [36]:
by_room_type = pd.pivot_table(df1, values = ['price'], index = ['room_type','neighbourhood_group'],
                              aggfunc = {'price': [np.median, np.count_nonzero, iqr]})
subtables = []
for row in by_room_type.index.levels[0]:
    subtables.append(by_room_type.loc[[row]].sort_values(by = ('price','median'), ascending = False))
by_room_type = pd.concat(t for t in subtables)

by_room_type[('price','median')] = by_room_type[('price','median')].map('$ {:.0f}'.format)
by_room_type[('price','iqr')] = by_room_type[('price','iqr')].map('$ {:.0f}'.format)
by_room_type[('price','count_nonzero')] = by_room_type[('price','count_nonzero')].map(int)

# the level-1 labels are stored alphabetically (count_nonzero, iqr, median), so rename in that order
by_room_type.columns.set_levels(['number listings','IQR','median price'], level=1, inplace=True)
by_room_type.columns = by_room_type.columns.droplevel(0)
by_room_type = by_room_type[['median price', 'IQR','number listings']] # change the column order
by_room_type
Out[36]:
median price IQR number listings
room_type neighbourhood_group
Entire home/apt Manhattan $ 191 $ 116 13198
Brooklyn $ 145 $ 94 9558
Queens $ 120 $ 78 2096
Bronx $ 100 $ 60 379
Staten Island $ 100 $ 76 176
Private room Manhattan $ 90 $ 52 7982
Brooklyn $ 65 $ 30 10126
Queens $ 60 $ 28 3372
Bronx $ 54 $ 30 651
Staten Island $ 50 $ 35 188
Shared room Manhattan $ 69 $ 40 480
Bronx $ 40 $ 28 60
Queens $ 37 $ 24 198
Brooklyn $ 36 $ 20 411
Staten Island $ 30 $ 46 9

A final consideration: we could continue this analysis and investigate fully even the strangest relations and patterns, but the risk is losing sight of the whole picture. Information must be assimilated by the analyst, and adding more and more of it is worthwhile only if it brings sufficient value.
Otherwise it is better to stop the information-extraction process and focus attention on the most relevant findings.