Improving the look of a plot

This aside shows how we can go about improving the visuals of this graph. It uses some topics that we will cover in later chapters, so you might want to come back to it once you’ve been through the material in the visualisation chapter.

import pandas as pd

city_pop_file = "https://bristol-training.github.io/introduction-to-data-analysis-in-python/data/city_pop.csv"
census = pd.read_csv(
    city_pop_file,
    skiprows=5,
    sep=";",
    na_values="-1",
    index_col="year",
)
census

---------------------------------------------------------------------------
HTTPError                                 Traceback (most recent call last)
Cell In[1], line 4
      1 import pandas as pd
      3 city_pop_file = "https://bristol-training.github.io/introduction-to-data-analysis-in-python/data/city_pop.csv"
----> 4 census = pd.read_csv(
      5     city_pop_file,
      6     skiprows=5,
      7     sep=";",
      8     na_values="-1",
      9     index_col="year",
     10 )
     11 census

File ~/work/data-analysis-python-1/data-analysis-python-1/.venv/lib/python3.12/site-packages/pandas/io/parsers/readers.py:1026, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)
   1013 kwds_defaults = _refine_defaults_read(
   1014     dialect,
   1015     delimiter,
   (...)   1022     dtype_backend=dtype_backend,
   1023 )
   1024 kwds.update(kwds_defaults)
-> 1026 return _read(filepath_or_buffer, kwds)

File ~/work/data-analysis-python-1/data-analysis-python-1/.venv/lib/python3.12/site-packages/pandas/io/parsers/readers.py:620, in _read(filepath_or_buffer, kwds)
    617 _validate_names(kwds.get("names", None))
    619 # Create the parser.
--> 620 parser = TextFileReader(filepath_or_buffer, **kwds)
    622 if chunksize or iterator:
    623     return parser

File ~/work/data-analysis-python-1/data-analysis-python-1/.venv/lib/python3.12/site-packages/pandas/io/parsers/readers.py:1620, in TextFileReader.__init__(self, f, engine, **kwds)
   1617     self.options["has_index_names"] = kwds["has_index_names"]
   1619 self.handles: IOHandles | None = None
-> 1620 self._engine = self._make_engine(f, self.engine)

File ~/work/data-analysis-python-1/data-analysis-python-1/.venv/lib/python3.12/site-packages/pandas/io/parsers/readers.py:1880, in TextFileReader._make_engine(self, f, engine)
   1878     if "b" not in mode:
   1879         mode += "b"
-> 1880 self.handles = get_handle(
   1881     f,
   1882     mode,
   1883     encoding=self.options.get("encoding", None),
   1884     compression=self.options.get("compression", None),
   1885     memory_map=self.options.get("memory_map", False),
   1886     is_text=is_text,
   1887     errors=self.options.get("encoding_errors", "strict"),
   1888     storage_options=self.options.get("storage_options", None),
   1889 )
   1890 assert self.handles is not None
   1891 f = self.handles.handle

File ~/work/data-analysis-python-1/data-analysis-python-1/.venv/lib/python3.12/site-packages/pandas/io/common.py:728, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
    725     codecs.lookup_error(errors)
    727 # open URLs
--> 728 ioargs = _get_filepath_or_buffer(
    729     path_or_buf,
    730     encoding=encoding,
    731     compression=compression,
    732     mode=mode,
    733     storage_options=storage_options,
    734 )
    736 handle = ioargs.filepath_or_buffer
    737 handles: list[BaseBuffer]

File ~/work/data-analysis-python-1/data-analysis-python-1/.venv/lib/python3.12/site-packages/pandas/io/common.py:384, in _get_filepath_or_buffer(filepath_or_buffer, encoding, compression, mode, storage_options)
    382 # assuming storage_options is to be interpreted as headers
    383 req_info = urllib.request.Request(filepath_or_buffer, headers=storage_options)
--> 384 with urlopen(req_info) as req:
    385     content_encoding = req.headers.get("Content-Encoding", None)
    386     if content_encoding == "gzip":
    387         # Override compression based on Content-Encoding header

File ~/work/data-analysis-python-1/data-analysis-python-1/.venv/lib/python3.12/site-packages/pandas/io/common.py:289, in urlopen(*args, **kwargs)
    283 """
    284 Lazy-import wrapper for stdlib urlopen, as that imports a big chunk of
    285 the stdlib.
    286 """
    287 import urllib.request
--> 289 return urllib.request.urlopen(*args, **kwargs)

File /usr/lib/python3.12/urllib/request.py:215, in urlopen(url, data, timeout, cafile, capath, cadefault, context)
    213 else:
    214     opener = _opener
--> 215 return opener.open(url, data, timeout)

File /usr/lib/python3.12/urllib/request.py:521, in OpenerDirector.open(self, fullurl, data, timeout)
    519 for processor in self.process_response.get(protocol, []):
    520     meth = getattr(processor, meth_name)
--> 521     response = meth(req, response)
    523 return response

File /usr/lib/python3.12/urllib/request.py:630, in HTTPErrorProcessor.http_response(self, request, response)
    627 # According to RFC 2616, "2xx" code indicates that the client's
    628 # request was successfully received, understood, and accepted.
    629 if not (200 <= code < 300):
--> 630     response = self.parent.error(
    631         'http', request, response, code, msg, hdrs)
    633 return response

File /usr/lib/python3.12/urllib/request.py:559, in OpenerDirector.error(self, proto, *args)
    557 if http_err:
    558     args = (dict, 'default', 'http_error_default') + orig_args
--> 559     return self._call_chain(*args)

File /usr/lib/python3.12/urllib/request.py:492, in OpenerDirector._call_chain(self, chain, kind, meth_name, *args)
    490 for handler in handlers:
    491     func = getattr(handler, meth_name)
--> 492     result = func(*args)
    493     if result is not None:
    494         return result

File /usr/lib/python3.12/urllib/request.py:639, in HTTPDefaultErrorHandler.http_error_default(self, req, fp, code, msg, hdrs)
    638 def http_error_default(self, req, fp, code, msg, hdrs):
--> 639     raise HTTPError(req.full_url, code, msg, hdrs, fp)

HTTPError: HTTP Error 404: Not Found

The simplest thing you can do is plot the graph with no additional options:

census.plot()

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[2], line 1
----> 1 census.plot()

NameError: name 'census' is not defined

The label on the x-axis is taken directly from the name of the column that we made into the index, "year". Let’s capitalise it by passing the xlabel argument to plot:

census.plot(
    xlabel="Year",
)

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[3], line 1
----> 1 census.plot(
      2     xlabel="Year",
      3 )

NameError: name 'census' is not defined

And then also set a y-axis label in a similar way:

census.plot(
    xlabel="Year",
    ylabel="Population (millions)",
)

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[4], line 1
----> 1 census.plot(
      2     xlabel="Year",
      3     ylabel="Population (millions)",
      4 )

NameError: name 'census' is not defined

The y-axis currently starts at around 2, which makes the difference between London and the other cities look greater than it actually is. It’s usually a good idea to set your y-axis to start at zero. We can do this by passing the tuple (0, None) to the ylim argument, where 0 sets the lower bound of the y-axis and None tells it to use the default scale for the upper bound:

census.plot(
    xlabel="Year",
    ylabel="Population (millions)",
    ylim=(0, None),
)

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[5], line 1
----> 1 census.plot(
      2     xlabel="Year",
      3     ylabel="Population (millions)",
      4     ylim=(0, None),
      5 )

NameError: name 'census' is not defined

This is now a perfectly functional graph. All we might want to do now is play with the aesthetics a little. With seaborn we can apply its theme, which uses nicer fonts and colours:

import seaborn as sns

sns.set_theme()

census.plot(
    xlabel="Year",
    ylabel="Population (millions)",
    ylim=(0, None),
)

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[6], line 5
      1 import seaborn as sns
      3 sns.set_theme()
----> 5 census.plot(
      6     xlabel="Year",
      7     ylabel="Population (millions)",
      8     ylim=(0, None),
      9 )

NameError: name 'census' is not defined

If we want a white background again, we can specify the seaborn style with sns.set_style:

sns.set_style("white")

census.plot(
    xlabel="Year",
    ylabel="Population (millions)",
    ylim=(0, None),
)

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[7], line 3
      1 sns.set_style("white")
----> 3 census.plot(
      4     xlabel="Year",
      5     ylabel="Population (millions)",
      6     ylim=(0, None),
      7 )

NameError: name 'census' is not defined

Or, if we prefer, we can plot with seaborn directly using sns.relplot:

sns.relplot(data=census, kind="line").set(
    xlabel="Year",
    ylabel="Population (millions)",
    ylim=(0, None),
)

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[8], line 1
----> 1 sns.relplot(data=census, kind="line").set(
      2     xlabel="Year",
      3     ylabel="Population (millions)",
      4     ylim=(0, None),
      5 )

NameError: name 'census' is not defined

Return to course