flyingdutchie
Nov-18-2007, 06:31 PM
Hi everyone,
I wrote, for my SmugFig API, this piece of code that transforms a string of keywords into an ArrayList of these keywords. It is a snippet of Java code.
/**
 * Turns a free-form keyword string into a list of individual keywords.
 *
 * <p>Three notations are understood and may be mixed in one string:
 * double-quoted keywords ({@code "red sox"}), keywords separated by commas or
 * semicolons (which may contain spaces), and single-word keywords separated
 * by whitespace.
 */
public class SomeClass {
    // ... (other members elided in the original post)

    /** A double-quoted keyword (group 2) followed by any run of whitespace, commas or semicolons. */
    private static final Pattern QUOTE_PATTERN = Pattern.compile("(\"([^\"]+)\"[\\s,;]*)");

    /** A run of characters containing no comma or semicolon (group 2), optionally ended by one. */
    private static final Pattern COMMA_PATTERN = Pattern.compile("(([^,;]+)\\s*[,;]?)");

    /** A run of characters containing no comma, semicolon or whitespace (group 2). */
    private static final Pattern SPACE_PATTERN = Pattern.compile("(([^,;\\s]+)\\s*)");

    /**
     * Parses a keyword string into its individual keywords.
     *
     * @param keywords the raw keyword string; may be {@code null}
     * @return the distinct keywords in natural (sorted) string order, an empty
     *         list for an empty string, or {@code null} when {@code keywords}
     *         is {@code null}
     */
    public static ArrayList<String> toKeywords(String keywords) {
        return quoteParse(keywords);
    }

    /**
     * Extracts the quoted keywords first, then splits whatever text is left
     * either on whitespace or on commas/semicolons.
     *
     * @param keywordsString the raw keyword string; may be {@code null}
     * @return sorted, de-duplicated keywords; {@code null} for {@code null} input
     */
    private static ArrayList<String> quoteParse(String keywordsString) {
        if (keywordsString == null) {
            return null;
        }
        final ArrayList<String> retValue = new ArrayList<String>();
        if (keywordsString.length() == 0) {
            return retValue;
        }

        // A TreeSet both removes duplicates and fixes the output order.
        final Set<String> keywords = new TreeSet<String>();

        final Matcher quoted = QUOTE_PATTERN.matcher(keywordsString);
        while (quoted.find()) {
            // Group 2 is never empty: the pattern requires at least one character
            // between the quotes. Quoted keywords are taken verbatim (not trimmed).
            keywords.add(quoted.group(2));
        }

        // The text outside the quotes decides which delimiter style is in use.
        // replaceAll() resets the matcher, so reusing it after the loop is safe.
        final String unquotedOnly = quoted.replaceAll("");
        if (unquotedOnly.length() > 0) {
            final boolean isSpaceDelimited =
                    unquotedOnly.indexOf(',') < 0 && unquotedOnly.indexOf(';') < 0;

            // Each quoted keyword is swapped for a real delimiter rather than
            // deleted. Deleting it glued the unquoted text on both sides together,
            // e.g. `world championship "parade"; hello` produced the single
            // keyword "world championship hello".
            if (isSpaceDelimited) {
                addTrimmedMatches(SPACE_PATTERN, quoted.replaceAll(" "), keywords);
            } else {
                addTrimmedMatches(COMMA_PATTERN, quoted.replaceAll(","), keywords);
            }
        }

        retValue.addAll(keywords);
        return retValue;
    }

    /** Adds the trimmed, non-empty group 2 of every match of {@code pattern} in {@code text} to {@code sink}. */
    private static void addTrimmedMatches(Pattern pattern, String text, Set<String> sink) {
        final Matcher matcher = pattern.matcher(text);
        while (matcher.find()) {
            final String keyword = matcher.group(2).trim();
            if (keyword.length() > 0) {
                sink.add(keyword);
            }
        }
    }

    // ... (other members elided in the original post)
}
It can parse
keywords separated by spaces
Single word keywords. E.g.
wedding anderson ceremony
keywords separated by commas or semicolons
Single or multi-word keywords. E.g.
boston, red sox; world championship
keywords separated by spaces, commas or semicolons and which are quoted
"boston" "red sox"; "world championship"
Or any combination thereof:
"boston", red sox, world championship "parade"; hello
It took me a while to figure out how Smugmug parses keywords. The code above comes close to it, I think. It is not perfect, and I have tested it only a little bit. Let me know if it works for you. :D
(Updated it with new code to also handle keywords separated by semicolons)
I wrote, for my SmugFig API, this piece of code that transforms a string of keywords into an ArrayList of these keywords. It is a snippet of Java code.
/**
 * Turns a free-form keyword string into a list of individual keywords.
 *
 * <p>Three notations are understood and may be mixed in one string:
 * double-quoted keywords ({@code "red sox"}), keywords separated by commas or
 * semicolons (which may contain spaces), and single-word keywords separated
 * by whitespace.
 */
public class SomeClass {
    // ... (other members elided in the original post)

    /** A double-quoted keyword (group 2) followed by any run of whitespace, commas or semicolons. */
    private static final Pattern QUOTE_PATTERN = Pattern.compile("(\"([^\"]+)\"[\\s,;]*)");

    /** A run of characters containing no comma or semicolon (group 2), optionally ended by one. */
    private static final Pattern COMMA_PATTERN = Pattern.compile("(([^,;]+)\\s*[,;]?)");

    /** A run of characters containing no comma, semicolon or whitespace (group 2). */
    private static final Pattern SPACE_PATTERN = Pattern.compile("(([^,;\\s]+)\\s*)");

    /**
     * Parses a keyword string into its individual keywords.
     *
     * @param keywords the raw keyword string; may be {@code null}
     * @return the distinct keywords in natural (sorted) string order, an empty
     *         list for an empty string, or {@code null} when {@code keywords}
     *         is {@code null}
     */
    public static ArrayList<String> toKeywords(String keywords) {
        return quoteParse(keywords);
    }

    /**
     * Extracts the quoted keywords first, then splits whatever text is left
     * either on whitespace or on commas/semicolons.
     *
     * @param keywordsString the raw keyword string; may be {@code null}
     * @return sorted, de-duplicated keywords; {@code null} for {@code null} input
     */
    private static ArrayList<String> quoteParse(String keywordsString) {
        if (keywordsString == null) {
            return null;
        }
        final ArrayList<String> retValue = new ArrayList<String>();
        if (keywordsString.length() == 0) {
            return retValue;
        }

        // A TreeSet both removes duplicates and fixes the output order.
        final Set<String> keywords = new TreeSet<String>();

        final Matcher quoted = QUOTE_PATTERN.matcher(keywordsString);
        while (quoted.find()) {
            // Group 2 is never empty: the pattern requires at least one character
            // between the quotes. Quoted keywords are taken verbatim (not trimmed).
            keywords.add(quoted.group(2));
        }

        // The text outside the quotes decides which delimiter style is in use.
        // replaceAll() resets the matcher, so reusing it after the loop is safe.
        final String unquotedOnly = quoted.replaceAll("");
        if (unquotedOnly.length() > 0) {
            final boolean isSpaceDelimited =
                    unquotedOnly.indexOf(',') < 0 && unquotedOnly.indexOf(';') < 0;

            // Each quoted keyword is swapped for a real delimiter rather than
            // deleted. Deleting it glued the unquoted text on both sides together,
            // e.g. `world championship "parade"; hello` produced the single
            // keyword "world championship hello".
            if (isSpaceDelimited) {
                addTrimmedMatches(SPACE_PATTERN, quoted.replaceAll(" "), keywords);
            } else {
                addTrimmedMatches(COMMA_PATTERN, quoted.replaceAll(","), keywords);
            }
        }

        retValue.addAll(keywords);
        return retValue;
    }

    /** Adds the trimmed, non-empty group 2 of every match of {@code pattern} in {@code text} to {@code sink}. */
    private static void addTrimmedMatches(Pattern pattern, String text, Set<String> sink) {
        final Matcher matcher = pattern.matcher(text);
        while (matcher.find()) {
            final String keyword = matcher.group(2).trim();
            if (keyword.length() > 0) {
                sink.add(keyword);
            }
        }
    }

    // ... (other members elided in the original post)
}
It can parse
keywords separated by spaces
Single word keywords. E.g.
wedding anderson ceremony
keywords separated by commas or semicolons
Single or multi-word keywords. E.g.
boston, red sox; world championship
keywords separated by spaces, commas or semicolons and which are quoted
"boston" "red sox"; "world championship"
Or any combination thereof:
"boston", red sox, world championship "parade"; hello
It took me a while to figure out how Smugmug parses keywords. The code above comes close to it, I think. It is not perfect, and I have tested it only a little bit. Let me know if it works for you. :D
(Updated it with new code to also handle keywords separated by semicolons)