Basic Text Handling

Contents

3. Basic Text Handling

A collection of simple commands to manage text data structures efficiently.

%%capture
#INCLUDING SCIENTIFIC AND NUMERICAL COMPUTING LIBRARIES
#Run this cell first to load all the required libraries in one go.
%pylab inline
import os
import pandas as pd

3.1. Examples: Basic Text Handling

# The preamble of the U.S. Constitution, used as the sample text throughout.
# NOTE: each trailing backslash continues the string literal onto the next line, so the
# leading spaces of the continuation lines become part of the text itself. These runs
# of spaces are what produce the empty strings seen when the text is split below.
text = "We the People of the United States, in Order to form a more perfect Union, establish Justice, \
        insure domestic Tranquility, provide for the common defence, promote the general Welfare, \
        and secure the Blessings of Liberty to ourselves and our Posterity, do ordain and establish \
        this Constitution for the United States of America."

#Check the Python type of the text object
type(text)

str

#How many characters are in the text, counting blanks and punctuation?
len(text)

#Tokenize the words by splitting on single spaces only (commas and periods stay attached)
x = text.split(" ")
print(x)

['We', 'the', 'People', 'of', 'the', 'United', 'States,', 'in', 'Order', 'to', 'form', 'a', 'more', 'perfect', 'Union,', 'establish', 'Justice,', '', '', '', '', '', '', '', '', 'insure', 'domestic', 'Tranquility,', 'provide', 'for', 'the', 'common', 'defence,', 'promote', 'the', 'general', 'Welfare,', '', '', '', '', '', '', '', '', 'and', 'secure', 'the', 'Blessings', 'of', 'Liberty', 'to', 'ourselves', 'and', 'our', 'Posterity,', 'do', 'ordain', 'and', 'establish', '', '', '', '', '', '', '', '', 'this', 'Constitution', 'for', 'the', 'United', 'States', 'of', 'America.']

#How many tokens? (the empty strings produced by the runs of spaces are counted too)
len(x)

But splitting on spaces alone leaves the commas and periods attached to the words (e.g., 'States,' and 'America.'), which is not desired. To split on several delimiters at once we need the regular expressions package, i.e., re. There are several online regex utilities you may use to test a pattern, e.g., https://regex101.com/.

import re
#Split on every single space, comma, or period; adjacent delimiters leave empty strings behind
x = re.compile('[ ,.]').split(text)
print(x)

['We', 'the', 'People', 'of', 'the', 'United', 'States', '', 'in', 'Order', 'to', 'form', 'a', 'more', 'perfect', 'Union', '', 'establish', 'Justice', '', '', '', '', '', '', '', '', '', 'insure', 'domestic', 'Tranquility', '', 'provide', 'for', 'the', 'common', 'defence', '', 'promote', 'the', 'general', 'Welfare', '', '', '', '', '', '', '', '', '', 'and', 'secure', 'the', 'Blessings', 'of', 'Liberty', 'to', 'ourselves', 'and', 'our', 'Posterity', '', 'do', 'ordain', 'and', 'establish', '', '', '', '', '', '', '', '', 'this', 'Constitution', 'for', 'the', 'United', 'States', 'of', 'America', '']

#Use a list comprehension to drop the empty strings left behind by the split
x = [token for token in x if token]
print(x)

['We', 'the', 'People', 'of', 'the', 'United', 'States', 'in', 'Order', 'to', 'form', 'a', 'more', 'perfect', 'Union', 'establish', 'Justice', 'insure', 'domestic', 'Tranquility', 'provide', 'for', 'the', 'common', 'defence', 'promote', 'the', 'general', 'Welfare', 'and', 'secure', 'the', 'Blessings', 'of', 'Liberty', 'to', 'ourselves', 'and', 'our', 'Posterity', 'do', 'ordain', 'and', 'establish', 'this', 'Constitution', 'for', 'the', 'United', 'States', 'of', 'America']

#How many words remain once the empty strings are dropped?
len(x)

#Unique words, ignoring case: lowercase every word, then deduplicate.
#np.unique returns the distinct values as a sorted NumPy array. Calling it through the
#explicit np namespace avoids depending on the bare `unique` name that only exists
#because %pylab star-imports NumPy into the notebook namespace.
import numpy as np
y = [word.lower() for word in x]
z = np.unique(y)
print(z)

['a' 'america' 'and' 'blessings' 'common' 'constitution' 'defence' 'do'
 'domestic' 'establish' 'for' 'form' 'general' 'in' 'insure' 'justice'
 'liberty' 'more' 'of' 'ordain' 'order' 'our' 'ourselves' 'people'
 'perfect' 'posterity' 'promote' 'provide' 'secure' 'states' 'the' 'this'
 'to' 'tranquility' 'union' 'united' 'we' 'welfare']

#How many distinct words (ignoring case)?
len(z)

3.2. Using List Comprehensions to find specific words

#Find words longer than 3 characters
[word for word in x if len(word) > 3]

['People',
 'United',
 'States',
 'Order',
 'form',
 'more',
 'perfect',
 'Union',
 'establish',
 'Justice',
 'insure',
 'domestic',
 'Tranquility',
 'provide',
 'common',
 'defence',
 'promote',
 'general',
 'Welfare',
 'secure',
 'Blessings',
 'Liberty',
 'ourselves',
 'Posterity',
 'ordain',
 'establish',
 'this',
 'Constitution',
 'United',
 'States',
 'America']

#Find capitalized words (str.istitle: a leading capital followed by lowercase letters)
[word for word in x if word.istitle()]

['We',
 'People',
 'United',
 'States',
 'Order',
 'Union',
 'Justice',
 'Tranquility',
 'Welfare',
 'Blessings',
 'Liberty',
 'Posterity',
 'Constitution',
 'United',
 'States',
 'America']

#Find words that begin with a lowercase c (the test is case-sensitive, so 'Constitution' is skipped)
[word for word in x if word.startswith('c')]

['common']

#Find words whose last letter is a lowercase t
[word for word in x if word.endswith('t')]

['perfect']

#Find words that contain the letter a, in either case (each word is lowercased before the test)
[word for word in x if "a" in set(word.lower())]

['States',
 'a',
 'establish',
 'Tranquility',
 'general',
 'Welfare',
 'and',
 'and',
 'ordain',
 'and',
 'establish',
 'States',
 'America']

#Membership check on the set of letters: 'perfect' has no letter a
"a" in set('perfect')

False

3.3. Other Simple Text Functions

#Test the type of each token: keep only the words written entirely in lowercase
print(x)
[word for word in x if word.islower()]

['We', 'the', 'People', 'of', 'the', 'United', 'States', 'in', 'Order', 'to', 'form', 'a', 'more', 'perfect', 'Union', 'establish', 'Justice', 'insure', 'domestic', 'Tranquility', 'provide', 'for', 'the', 'common', 'defence', 'promote', 'the', 'general', 'Welfare', 'and', 'secure', 'the', 'Blessings', 'of', 'Liberty', 'to', 'ourselves', 'and', 'our', 'Posterity', 'do', 'ordain', 'and', 'establish', 'this', 'Constitution', 'for', 'the', 'United', 'States', 'of', 'America']

['the',
 'of',
 'the',
 'in',
 'to',
 'form',
 'a',
 'more',
 'perfect',
 'establish',
 'insure',
 'domestic',
 'provide',
 'for',
 'the',
 'common',
 'defence',
 'promote',
 'the',
 'general',
 'and',
 'secure',
 'the',
 'of',
 'to',
 'ourselves',
 'and',
 'our',
 'do',
 'ordain',
 'and',
 'establish',
 'this',
 'for',
 'the',
 'of']

#Keep only the tokens made up entirely of digits
print(x)
[word for word in x if word.isdigit()]

['We', 'the', 'People', 'of', 'the', 'United', 'States', 'in', 'Order', 'to', 'form', 'a', 'more', 'perfect', 'Union', 'establish', 'Justice', 'insure', 'domestic', 'Tranquility', 'provide', 'for', 'the', 'common', 'defence', 'promote', 'the', 'general', 'Welfare', 'and', 'secure', 'the', 'Blessings', 'of', 'Liberty', 'to', 'ourselves', 'and', 'our', 'Posterity', 'do', 'ordain', 'and', 'establish', 'this', 'Constitution', 'for', 'the', 'United', 'States', 'of', 'America']

[]

#Keep only the tokens made up entirely of letters and/or digits
[word for word in x if word.isalnum()]

['We',
 'the',
 'People',
 'of',
 'the',
 'United',
 'States',
 'in',
 'Order',
 'to',
 'form',
 'a',
 'more',
 'perfect',
 'Union',
 'establish',
 'Justice',
 'insure',
 'domestic',
 'Tranquility',
 'provide',
 'for',
 'the',
 'common',
 'defence',
 'promote',
 'the',
 'general',
 'Welfare',
 'and',
 'secure',
 'the',
 'Blessings',
 'of',
 'Liberty',
 'to',
 'ourselves',
 'and',
 'our',
 'Posterity',
 'do',
 'ordain',
 'and',
 'establish',
 'this',
 'Constitution',
 'for',
 'the',
 'United',
 'States',
 'of',
 'America']

3.4. String operations

# Sample string padded with two leading and two trailing spaces
y = '  To be or not to be.  '
print(y.strip())   # remove whitespace from both ends
print(y.rstrip())  # remove trailing whitespace only
print(y.lstrip())  # remove leading whitespace only
print(y.lower())   # convert to lowercase (padding is kept)
print(y.upper())   # convert to uppercase (padding is kept)

To be or not to be.
  To be or not to be.
To be or not to be.  
  to be or not to be.  
  TO BE OR NOT TO BE.  

#Return the index at which the substring starts: find gives the first match, rfind the last
print(y.find('be'))
print(y.rfind('be'))

5
18

#Replace every occurrence of 'be' with 'do'
print(y.replace('be','do'))

  To do or not to do.

# The longest word ever composed in German, at 80 letters, is
# "Donaudampfschifffahrtselektrizitätenhauptbetriebswerkbauunterbeamtengesellschaft,"
# which means the "Association for Subordinate Officials of the Head Office Management
# of the Danube Steamboat Electrical Services."
# The string below spells the umlaut "ä" as a plain "a".
y = 'Donaudampfschifffahrtselektrizitatenhauptbetriebswerkbauunterbeamtengesellschaft'
# Cut the word at every 'a'; the delimiter itself is dropped from the pieces
ytok = y.split(sep='a')
print(ytok)

['Don', 'ud', 'mpfschifff', 'hrtselektrizit', 'tenh', 'uptbetriebswerkb', 'uunterbe', 'mtengesellsch', 'ft']

# Joining the pieces with 'a' as the separator rebuilds the original word
print('a'.join(ytok))

Donaudampfschifffahrtselektrizitatenhauptbetriebswerkbauunterbeamtengesellschaft

# list() breaks the string into its individual characters, in order, keeping repeated letters
print(list(y))

['D', 'o', 'n', 'a', 'u', 'd', 'a', 'm', 'p', 'f', 's', 'c', 'h', 'i', 'f', 'f', 'f', 'a', 'h', 'r', 't', 's', 'e', 'l', 'e', 'k', 't', 'r', 'i', 'z', 'i', 't', 'a', 't', 'e', 'n', 'h', 'a', 'u', 'p', 't', 'b', 'e', 't', 'r', 'i', 'e', 'b', 's', 'w', 'e', 'r', 'k', 'b', 'a', 'u', 'u', 'n', 't', 'e', 'r', 'b', 'e', 'a', 'm', 't', 'e', 'n', 'g', 'e', 's', 'e', 'l', 'l', 's', 'c', 'h', 'a', 'f', 't']