Posts Tagged ‘name formatting’

Properizing names on HTML forms

Wednesday, August 6th, 2008

If you’ve ever put a form online that asks the user for their name, you will want to read this post!

How many times have you gotten form data that contains a person’s name, and it isn’t capitalized correctly?  Probably much more often than you would like!  Either the name is in all CAPS, all lowercase, or some mixture that just doesn’t agree with the purpose you are using it for.  I got really tired of having to manually format names from online applications and other various forms.  It takes a good chunk of time to scour that data and manually fix up the capitalization of the names – time that could be better spent on other endeavours.

So I set out determined to find some automated way of fixing the data before it ever gets to me.  Mind you, creating an automated procedure to properly format names is a daunting task.  There are so many name variations and rules involved.  But it has been done before.

It all starts in the genes
John Cardinal makes companion programs for The Master Genealogist (TMG) from Wholly Genes.  He created a function in one of his programs that properly capitalizes names from the TMG database.  He wrote notes on how to do it here.

Then came along Tim Morgan who took John’s notes and developed a routine in Python to perform the capitalization on names passed to the routine.  Once I found his code snippet, I knew that it was something special.  However, I’m not a Python developer – in fact, I know absolutely nothing about Python except that it is a server-side scripting system similar to Perl or PHP.

I do, however, know JavaScript.  And I know that the majority of browsers support JavaScript and the majority of web users have their JavaScript turned on.  So I decided to undertake the task of porting Tim’s Python routine to JavaScript where I think it would be more useful.  This post is the fruit of my efforts!

The Good Stuff
So in your form, you will have your name fields, which should pass the value of the field to the JavaScript function.  Here is an example form field:

<input type="text" name="first_name" id="first_name" onchange="this.value = properizeName(this.value);" />

And the text below is the actual JavaScript function:

/**
 * Normalizes the capitalization of a person's name.
 *
 * Processing steps:
 *   1. Leading hyphens and periods are dropped.
 *   2. The name is broken into words at spaces, hyphens and periods.
 *   3. Every word that starts with an ASCII capital letter is lower-cased,
 *      then the first letter of every word is capitalized. Words that start
 *      with a lowercase letter keep the case of their remaining letters.
 *   4. Words starting with "Mc" followed by at least two word characters get
 *      the letter after "Mc" capitalized (e.g. "Mcdonald" becomes "McDonald").
 *   5. The words are put back together with their original separators.
 *   6. " De ", " Dit " and " Van " (surrounded by spaces) are lower-cased.
 *   7. Every space-delimited occurrence of an entry from the exception list
 *      (special surnames and generational suffixes) is replaced by the exact
 *      spelling stored in the list.
 *
 * @param {string} name - The name as entered by the user.
 * @returns {string} The re-capitalized name. A non-string argument is
 *     returned unchanged.
 */
function properizeName(name) {
	if (typeof name !== 'string') { //Nothing sensible can be done with non-string input
		return name;
	}
	var startsUpperCase = /^[A-Z]/; //Regexp for words that START with an uppercase letter (the rest of the word is not inspected)
	var suffixes = ["II", "(II)", "III", "(III)", "IV", "(IV)", "VI", "(VI)", "VII", "(VII)", "2nd", "(2nd)", "3rd", "(3rd)", "4th", "(4th)", "5th", "(5th)"];
	var surnames = ["ApShaw", "d'Albini", "d'Aubigney", "d'Aubigne", "d'Autry", "d'Entremont", "d'Hurst", "D'ovidio", "da Graca", "DaSilva", "DeAnda", "deAnnethe", "deAubigne", "deAubigny", "DeBardelaben", "DeBardeleben", "DeBaugh", "deBeauford", "DeBerry", "deBethune", "DeBetuile", "DeBoard", "DeBoer", "DeBohun", "DeBord", "DeBose", "DeBrouwer", "DeBroux", "DeBruhl", "deBruijn", "deBrus", "deBruse", "deBrusse", "DeBruyne", "DeBusk", "DeCamp", "deCastilla", "DeCello", "deClare", "DeClark", "DeClerck", "DeCoste", "deCote", "DeCoudres", "DeCoursey", "DeCredico", "deCuire", "DeCuyre", "DeDominicios", "DeDuyster", "DeDuytscher", "DeDuytser", "deFiennes", "DeFord", "DeForest", "DeFrance", "DeFriece", "DeGarmo", "deGraaff", "DeGraff", "DeGraffenreid", "DeGraw", "DeGrenier", "DeGroats", "DeGroft", "DeGrote", "DeHaan", "DeHaas", "DeHaddeclive", "deHannethe", "DeHatclyf", "DeHaven", "DeHeer", "DeJager", "DeJarnette", "DeJean", "DeJong", "deJonge", "deKemmeter", "deKirketon", "DeKroon", "deKype", "del-Rosario", "dela Chamotte", "DeLa Cuadra", "DeLa Force", "dela Fountaine", "dela Grena", "dela Place", "DeLa Ward", "DeLaci", "DeLacy", "DeLaet", "DeLalonde", "DelAmarre", "DeLancey", "DeLascy", "DelAshmutt", "DeLassy", "DeLattre", "DeLaughter", "DeLay", "deLessine", "DelGado", "DelGaudio", "DeLiberti", "DeLoache", "DeLoatch", "DeLoch", "DeLockwood", "DeLong", "DeLozier", "DeLuca", "DeLucenay", "deLucy", "DeMars", "DeMartino", "deMaule", "DeMello", "DeMinck", "DeMink", "DeMoree", "DeMoss", "DeMott", "DeMuynck", "deNiet", "DeNise", "DeNure", "DePalma", "DePasquale", "dePender", "dePercy", "DePoe", "DePriest", "DePu", "DePui", "DePuis", "DeReeper", "deRochette", "deRose", "DeRossett", "DeRover", "deRuggele", "deRuggle", "DeRuyter", "deSaint-Sauveur", "DeSantis", "desCuirs", "DeSentis", "DeShane", "DeSilva", "DesJardins", "DesMarest", "deSoleure", "DeSoto", "DeSpain", "DeStefano", "deSwaert", "deSwart", "DeVall", "DeVane", "DeVasher", "DeVasier", "DeVaughan", "DeVaughn",
		"DeVault", "DeVeau", "DeVeault", "deVilleneuve", "DeVilliers", "DeVinney", "DeVito", "deVogel", "DeVolder", "DeVolld", "DeVore", "deVos", "DeVries", "deVries", "DeWall", "DeWaller", "DeWalt", "deWashington", "deWerly", "deWessyngton", "DeWet", "deWinter", "DeWitt", "DeWolf", "DeWolfe", "DeWolff", "DeWoody", "DeYager", "DeYarmett", "DeYoung", "DiCicco", "DiCredico", "DiFillippi", "DiGiacomo", "DiMarco", "DiMeo", "DiMonte", "DiNonno", "DiPietro", "diPilato", "DiPrima", "DiSalvo", "du Bosc", "du Hurst", "DuFort", "DuMars", "DuPre", "DuPue", "DuPuy", "FitzUryan", "kummel", "LaBarge", "LaBarr", "LaBauve", "LaBean", "LaBelle", "LaBerteaux", "LaBine", "LaBonte", "LaBorde", "LaBounty", "LaBranche", "LaBrash", "LaCaille", "LaCasse", "LaChapelle", "LaClair", "LaComb", "LaCoste", "LaCount", "LaCour", "LaCroix", "LaFarlett", "LaFarlette", "LaFerry", "LaFlamme", "LaFollette", "LaForge", "LaFortune", "LaFoy", "LaFramboise", "LaFrance", "LaFuze", "LaGioia", "LaGrone", "LaLiberte", "LaLonde", "LaLone", "LaMaster", "LaMay", "LaMere", "LaMont", "LaMotte", "LaPeer", "LaPierre", "LaPlante", "LaPoint", "LaPointe", "LaPorte", "LaPrade", "LaRocca", "LaRochelle", "LaRose", "LaRue", "LaVallee", "LaVaque", "LaVeau", "LeBleu", "LeBoeuf", "LeBoiteaux", "LeBoyteulx", "LeCheminant", "LeClair", "LeClerc", "LeCompte", "LeCroy", "LeDuc", "LeFevbre", "LeFever", "LeFevre", "LeFlore", "LeGette", "LeGrand", "LeGrave", "LeGro", "LeGros", "LeJeune", "LeMaistre", "LeMaitre", "LeMaster", "LeMesurier", "LeMieux", "LeMoe", "LeMoigne", "LeMoine", "LeNeve", "LePage", "LeQuire", "LeQuyer", "LeRou", "LeRoy", "LeSuer", "LeSueur", "LeTardif", "LeVally", "LeVert", "LoMonaco", "Macabe", "Macaluso", "MacaTasney", "Macaulay", "Macchitelli", "Maccoone", "Maccurry", "Macdermattroe", "Macdiarmada", "Macelvaine", "Macey", "Macgraugh", "Machan", "Machann", "Machum", "Maciejewski", "Maciel", "Mackaben", "Mackall", "Mackartee", "Mackay", "Macken", "Mackert", "Mackey", "Mackie", "Mackin", "Mackins", "Macklin", "Macko",
		"Macksey", "Mackwilliams", "Maclean", "Maclinden", "Macomb", "Macomber", "Macon", "Macoombs", "Macraw", "Macumber", "Macurdy", "Macwilliams", "MaGuinness", "MakCubyn", "MakCumby", "Mcelvany", "Mcsherry", "Op den Dyck", "Op den Graeff", "regory", "Schweißguth", "StElmo", "StGelais", "StJacques", "te Boveldt", "VanAernam", "VanAken", "VanAlstine", "VanAmersfoort", "VanAntwerp", "VanArlem", "VanArnam", "VanArnem", "VanArnhem", "VanArnon", "VanArsdale", "VanArsdalen", "VanArsdol", "vanAssema", "vanAsten", "VanAuken", "VanAwman", "VanBaucom", "VanBebber", "VanBeber", "VanBenschoten", "VanBibber", "VanBilliard", "vanBlare", "vanBlaricom", "VanBuren", "VanBuskirk", "VanCamp", "VanCampen", "VanCleave", "VanCleef", "VanCleve", "VanCouwenhoven", "VanCovenhoven", "VanCowenhoven", "VanCuren", "VanDalsem", "VanDam", "VanDe Poel", "vanden Dijkgraaf", "vanden Kommer", "VanDer Aar", "vander Gouwe", "VanDer Honing", "VanDer Hooning", "vander Horst", "vander Kroft", "vander Krogt", "VanDer Meer", "vander Meulen", "vander Putte", "vander Schooren", "VanDer Veen", "VanDer Ven", "VanDer Wal", "VanDer Weide", "VanDer Willigen", "vander Wulp", "vander Zanden", "vander Zwan", "VanDer Zweep", "VanDeren", "VanDerlaan", "VanDerveer", "VanderWoude", "VanDeursen", "VanDeusen", "vanDijk", "VanDoren", "VanDorn", "VanDort", "VanDruff", "VanDryer", "VanDusen", "VanDuzee", "VanDuzen", "VanDuzer", "VanDyck", "VanDyke", "VanEman", "VanEmmen", "vanEmmerik", "VanEngen", "vanErp", "vanEssen", "VanFleet", "VanGalder", "VanGelder", "vanGerrevink", "VanGog", "vanGogh", "VanGorder", "VanGordon", "VanGroningen", "VanGuilder", "VanGundy", "VanHaaften", "VanHaute", "VanHees", "vanHeugten", "VanHise", "VanHoeck", "VanHoek", "VanHook", "vanHoorn", "VanHoornbeeck", "VanHoose", "VanHooser", "VanHorn", "VanHorne", "VanHouten", "VanHoye", "VanHuijstee", "VanHuss", "VanImmon", "VanKersschaever", "VanKeuren", "VanKleeck", "VanKoughnet", "VanKouwenhoven", "VanKuykendaal", "vanLeeuwen", "vanLent", "vanLet", "VanLeuven",
		"vanLingen", "VanLoozen", "VanLopik", "VanLuven", "vanMaasdijk", "VanMele", "VanMeter", "vanMoorsel", "VanMoorst", "VanMossevelde", "VanNaarden", "VanNamen", "VanNemon", "VanNess", "VanNest", "VanNimmen", "vanNobelen", "VanNorman", "VanNormon", "VanNostrunt", "VanNote", "VanOker", "vanOosten", "VanOrden", "VanOrder", "VanOrma", "VanOrman", "VanOrnum", "VanOstrander", "VanOvermeire", "VanPelt", "VanPool", "VanPoole", "VanPoorvliet", "VanPutten", "vanRee", "VanRhijn", "vanRijswijk", "VanRotmer", "VanSchaick", "vanSchelt", "VanSchoik", "VanSchoonhoven", "VanSciver", "VanScoy", "VanScoyoc", "vanSeters", "VanSickle", "VanSky", "VanSnellenberg", "vanStaveren", "VanStraten", "VanSuijdam", "VanTassel", "VanTassell", "VanTessel", "VanTexel", "VanTuyl", "VanValckenburgh", "vanValen", "VanValkenburg", "VanVelsor", "VanVelzor", "VanVlack", "VanVleck", "VanVleckeren", "VanWaard", "VanWart", "VanWassenhove", "VanWinkle", "VanWoggelum", "vanWordragen", "VanWormer", "VanZuidam", "VanZuijdam", "VonAdenbach", "vonAllmen", "vonBardeleben", "vonBerckefeldt", "VonBergen", "vonBreyman", "VonCannon", "vonFreymann", "vonHeimburg", "VonHuben", "vonKramer", "vonKruchenburg", "vonPostel", "VonRohr", "VonRohrbach", "VonSass", "VonSasse", "vonSchlotte", "VonSchneider", "VonSeldern", "VonSpringer", "VonVeyelmann", "VonZweidorff"];
	var exceptions = surnames.concat(suffixes); //Suffixes are matched exactly like the special surnames
	var mc = /^Mc(\w)(?=\w)/i; //Regexp for "Mc" followed by at least two word characters
	var i;

	//Leading hyphens and periods are discarded. Doing this before anything else
	//keeps it from disturbing the separators that are restored further down.
	name = name.replace(/^[-.]+/, '');

	//Break the name into words at spaces, hyphens and periods, remembering each
	//separator so it can be put back between the same two words afterwards.
	var words = [''];
	var separators = [];
	for (i = 0; i < name.length; i++) {
		var ch = name.charAt(i);
		if (ch === ' ' || ch === '-' || ch === '.') {
			separators.push(ch);
			words.push('');
		} else {
			words[words.length - 1] += ch;
		}
	}

	for (i = 0; i < words.length; i++) {
		var word = words[i];
		if (startsUpperCase.test(word)) { //Words starting with a capital are lower-cased entirely
			word = word.toLowerCase();
		}
		word = word.charAt(0).toUpperCase() + word.slice(1); //Capitalize the first letter
		if (mc.test(word)) { //"Mcdonald" -> "McDonald"
			word = 'Mc' + word.charAt(2).toUpperCase() + word.slice(3);
		}
		//NOTE: no equivalent rule is applied to the "Mac" prefix; "Macdonald" stays as it is.
		words[i] = word;
	}

	//Reassemble the name with the original separators
	name = words[0];
	for (i = 1; i < words.length; i++) {
		name += separators[i - 1] + words[i];
	}

	name = name.replace(/ De /gi, ' de '); //Replace ' De ' with ' de '
	name = name.replace(/ Dit /gi, ' dit '); //Replace ' Dit ' with ' dit '
	name = name.replace(/ Van /gi, ' van '); //Replace ' Van ' with ' van '

	//Apply the exception list. Replacements only change letter case, so positions
	//found in the lower-case copy stay valid while the name is being rewritten.
	var lcName = name.toLowerCase();
	for (i = 0; i < exceptions.length; i++) {
		var exact = exceptions[i];
		var needle = exact.toLowerCase();
		var pos = lcName.indexOf(needle);
		while (pos > -1) { //Check every occurrence, not just the first one
			var end = pos + exact.length;
			var startsWord = (pos === 0) || (name.charAt(pos - 1) === ' ');
			var endsWord = (end === name.length) || (name.charAt(end) === ' ');
			if (startsWord && endsWord) {
				name = name.substring(0, pos) + exact + name.substring(end);
			}
			pos = lcName.indexOf(needle, pos + 1);
		}
	}
	return name;
}

I’d love to hear your comments!