crfpp_datasets_segmentation.ipynb (Source)
In [1]:
%matplotlib inline
In [2]:
import numpy as np
import pandas as pd
In [3]:
def read_sentences(filepath_or_buffer, sep='\t', header=None,
                   skip_blank_lines=False):
    """Read a CoNLL-style token file into a single DataFrame with a
    (sequence, token) MultiIndex.

    The input file contains one token per line (tab-separated fields)
    with blank lines separating sequences/sentences.

    Parameters
    ----------
    filepath_or_buffer : str or file-like
        Passed through to ``pd.read_csv``.
    sep : str, default '\\t'
        Field separator.
    header : int or None, default None
        Passed through to ``pd.read_csv`` (no header row by default).
    skip_blank_lines : bool, default False
        Must stay False: blank lines are read as all-NaN rows and used
        as sequence boundaries.

    Returns
    -------
    pd.DataFrame
        Token rows only (separator rows removed), indexed by a
        MultiIndex of (sequence id, 0-based token position).
    """
    # Keep blank lines so they appear as all-NaN rows marking boundaries.
    df_all = pd.read_csv(filepath_or_buffer,
                         sep=sep, header=header,
                         skip_blank_lines=skip_blank_lines)
    # Separator rows are entirely NaN (i.e. originally blank lines).
    is_blank = df_all.isna().all(axis='columns')
    # Sequence id = number of separators seen so far: tokens before the
    # first blank line get id 0, the next batch id 1, and so on.
    # (Ids are simply skipped across consecutive blank lines, matching
    # the previous split-and-concat behavior for empty chunks.)
    sequence_id = is_blank.cumsum()
    tokens = df_all.loc[~is_blank]
    token_seq = sequence_id.loc[~is_blank]
    # 0-based position of each token within its sequence.
    token_pos = tokens.groupby(token_seq).cumcount()
    index = pd.MultiIndex.from_arrays([token_seq.to_numpy(),
                                       token_pos.to_numpy()])
    return tokens.set_axis(index, axis='index')
In [4]:
# Load the CRF++ segmentation training data (token per line, blank-line
# separated sequences) into a (sequence, token)-MultiIndexed DataFrame.
seg_sentences_train = read_sentences('../example/seg/train.data')
The number of sequences $N_{seq}$ is:
In [5]:
# Number of sequences: size of the outer (sequence-id) index level.
len(seg_sentences_train.index.levels[0])
Out[5]:
The total number of tokens (words) $N = \sum_{n=1}^{N_{seq}} T_n$ is:
In [6]:
# Total number of token rows summed over all sequences.
len(seg_sentences_train)
Out[6]:
The length $T_1$ of the first sequence (outer-index label 0) is 38:
In [7]:
# Tokens of the first sequence (outer-index label 0), transposed for
# compact horizontal display.
seg_sentences_train.loc[0].T
Out[7]:
In [8]:
# Tokens of the second sequence (outer-index label 1), transposed.
seg_sentences_train.loc[1].T
Out[8]:
In [9]:
# Tokens of the sequence with outer-index label 35, transposed.
seg_sentences_train.loc[35].T
Out[9]:
In [10]:
# Number of distinct values in column 0 (presumably the token/word
# column — column meanings come from the headerless data file).
seg_sentences_train[0].nunique()
Out[10]:
In [11]:
# Frequency of each distinct value in column 0, most common first.
seg_sentences_train[0].value_counts()
Out[11]: