Localized 'time ago' parser for YouTube with multiple phrases for each time unit.

This commit is contained in:
wojcik-online 2018-01-28 19:47:12 +01:00
parent a5548b4dc2
commit 77a74a91c9
7 changed files with 86 additions and 35 deletions

View File

@ -16,6 +16,7 @@ import org.schabi.newpipe.extractor.exceptions.ReCaptchaException;
import org.schabi.newpipe.extractor.playlist.PlaylistExtractor;
import org.schabi.newpipe.extractor.stream.StreamInfoItemCollector;
import org.schabi.newpipe.extractor.stream.StreamType;
import org.schabi.newpipe.extractor.stream.TimeAgoParser;
import org.schabi.newpipe.extractor.utils.Parser;
import org.schabi.newpipe.extractor.utils.Utils;
@ -26,6 +27,8 @@ import javax.annotation.Nonnull;
@SuppressWarnings("WeakerAccess")
public class YoutubePlaylistExtractor extends PlaylistExtractor<YoutubeService> {
private final TimeAgoParser timeAgoParser = getService().getTimeAgoParser();
private Document doc;
/**
* It's lazily initialized (when getNextStreams is called)
@ -206,7 +209,7 @@ public class YoutubePlaylistExtractor extends PlaylistExtractor<YoutubeService>
continue;
}
collector.commit(new YoutubeStreamInfoItemExtractor(li, null) {
collector.commit(new YoutubeStreamInfoItemExtractor(li, timeAgoParser) {
public Element uploaderLink;
@Override

View File

@ -8,6 +8,7 @@ import org.schabi.newpipe.extractor.NewPipe;
import org.schabi.newpipe.extractor.exceptions.ExtractionException;
import org.schabi.newpipe.extractor.search.InfoItemSearchCollector;
import org.schabi.newpipe.extractor.search.SearchEngine;
import org.schabi.newpipe.extractor.stream.TimeAgoParser;
import java.io.IOException;
import java.net.URLEncoder;
@ -38,8 +39,11 @@ public class YoutubeSearchEngine extends SearchEngine {
private static final String TAG = YoutubeSearchEngine.class.toString();
public static final String CHARSET_UTF_8 = "UTF-8";
public YoutubeSearchEngine(int serviceId) {
private final TimeAgoParser timeAgoParser;
/**
* Creates a search engine that hands the given parser to the stream item extractors it creates.
* @param serviceId Passed through to the SearchEngine super constructor
* @param timeAgoParser Stored as-is and given to each YoutubeStreamInfoItemExtractor
*/
public YoutubeSearchEngine(int serviceId, TimeAgoParser timeAgoParser) {
super(serviceId);
this.timeAgoParser = timeAgoParser;
}
@Override
@ -107,7 +111,7 @@ public class YoutubeSearchEngine extends SearchEngine {
// video item type
} else if ((el = item.select("div[class*=\"yt-lockup-video\"]").first()) != null) {
collector.commit(new YoutubeStreamInfoItemExtractor(el, null));
collector.commit(new YoutubeStreamInfoItemExtractor(el, timeAgoParser));
} else if ((el = item.select("div[class*=\"yt-lockup-channel\"]").first()) != null) {
collector.commit(new YoutubeChannelInfoItemExtractor(el));
} else if ((el = item.select("div[class*=\"yt-lockup-playlist\"]").first()) != null &&

View File

@ -13,6 +13,8 @@ import org.schabi.newpipe.extractor.stream.StreamExtractor;
import org.schabi.newpipe.extractor.stream.TimeAgoParser;
import java.io.IOException;
import java.util.EnumMap;
import java.util.Map;
/*
@ -37,13 +39,15 @@ import java.io.IOException;
public class YoutubeService extends StreamingService {
private Map<TimeAgoParser.TimeAgoUnit, String[]> timeAgoParserPhrases;
public YoutubeService(int id, String name) {
super(id, name);
}
@Override
public SearchEngine getSearchEngine() {
return new YoutubeSearchEngine(getServiceId());
// Every engine receives its own parser instance: getTimeAgoParser() builds a new one per call.
return new YoutubeSearchEngine(getServiceId(), getTimeAgoParser());
}
@Override
@ -104,12 +108,36 @@ public class YoutubeService extends StreamingService {
return list;
}
/**
 * Sets the phrases used to parse upload dates in the format '2 days ago'.
 * <p>
 * Every array is copied, so later changes to the caller's arrays do not affect parsing.
 * A {@code null} array is stored as an empty one, meaning that unit is never recognized
 * (instead of failing with a NullPointerException while a date is being parsed).
 * The phrase map of this service is replaced only after the new map is completely built.
 * </p>
 * @param secondsPhrases How to recognize seconds
 * @param minutesPhrases How to recognize minutes
 * @param hoursPhrases How to recognize hours
 * @param daysPhrases How to recognize days
 * @param weeksPhrases How to recognize weeks
 * @param monthsPhrases How to recognize months
 * @param yearsPhrases How to recognize years
 */
public void setTimeAgoParserPhrases(String[] secondsPhrases, String[] minutesPhrases,
                                    String[] hoursPhrases, String[] daysPhrases,
                                    String[] weeksPhrases, String[] monthsPhrases,
                                    String[] yearsPhrases) {
    // Fill a local map first so that the field never refers to a half-populated map.
    Map<TimeAgoParser.TimeAgoUnit, String[]> phrases =
            new EnumMap<>(TimeAgoParser.TimeAgoUnit.class);
    phrases.put(TimeAgoParser.TimeAgoUnit.SECONDS, copyPhrases(secondsPhrases));
    phrases.put(TimeAgoParser.TimeAgoUnit.MINUTES, copyPhrases(minutesPhrases));
    phrases.put(TimeAgoParser.TimeAgoUnit.HOURS, copyPhrases(hoursPhrases));
    phrases.put(TimeAgoParser.TimeAgoUnit.DAYS, copyPhrases(daysPhrases));
    phrases.put(TimeAgoParser.TimeAgoUnit.WEEKS, copyPhrases(weeksPhrases));
    phrases.put(TimeAgoParser.TimeAgoUnit.MONTHS, copyPhrases(monthsPhrases));
    phrases.put(TimeAgoParser.TimeAgoUnit.YEARS, copyPhrases(yearsPhrases));
    timeAgoParserPhrases = phrases;
}

/** Returns a private copy of the given phrases, or an empty array for {@code null}. */
private static String[] copyPhrases(String[] phrases) {
    return phrases == null ? new String[0] : phrases.clone();
}
/**
* Creates a new parser on every call. It uses the phrases given to
* setTimeAgoParserPhrases, or TimeAgoParser.DEFAULT_AGO_PHRASES if none have been set.
* @return A helper to parse upload dates in the format '2 days ago'.
*
* TODO Introduce support for multiple languages.
*/
TimeAgoParser getTimeAgoParser() {
return new TimeAgoParser(TimeAgoParser.DEFAULT_AGO_PHRASES);
return new TimeAgoParser(timeAgoParserPhrases == null ?
TimeAgoParser.DEFAULT_AGO_PHRASES : timeAgoParserPhrases);
}
}

View File

@ -26,6 +26,7 @@ import org.schabi.newpipe.extractor.stream.StreamInfoItemCollector;
import org.schabi.newpipe.extractor.stream.StreamInfoItemExtractor;
import org.schabi.newpipe.extractor.stream.StreamType;
import org.schabi.newpipe.extractor.stream.SubtitlesFormat;
import org.schabi.newpipe.extractor.stream.TimeAgoParser;
import org.schabi.newpipe.extractor.stream.VideoStream;
import org.schabi.newpipe.extractor.utils.Parser;
import org.schabi.newpipe.extractor.utils.Utils;
@ -87,6 +88,8 @@ public class YoutubeStreamExtractor extends StreamExtractor<YoutubeService> {
/*//////////////////////////////////////////////////////////////////////////*/
private final TimeAgoParser timeAgoParser = getService().getTimeAgoParser();
private Document doc;
@Nullable
private JsonObject playerArgs;
@ -812,7 +815,7 @@ public class YoutubeStreamExtractor extends StreamExtractor<YoutubeService> {
* This is encapsulated in a StreamInfoItem object, which is a subset of the fields in a full StreamInfo.
*/
private StreamInfoItemExtractor extractVideoPreviewInfo(final Element li) {
return new YoutubeStreamInfoItemExtractor(li, null) {
return new YoutubeStreamInfoItemExtractor(li, timeAgoParser) {
@Override
public String getUrl() throws ParsingException {

View File

@ -31,6 +31,7 @@ import org.schabi.newpipe.extractor.exceptions.ExtractionException;
import org.schabi.newpipe.extractor.exceptions.ParsingException;
import org.schabi.newpipe.extractor.kiosk.KioskExtractor;
import org.schabi.newpipe.extractor.stream.StreamInfoItemCollector;
import org.schabi.newpipe.extractor.stream.TimeAgoParser;
import java.io.IOException;
@ -38,7 +39,7 @@ import javax.annotation.Nonnull;
public class YoutubeTrendingExtractor extends KioskExtractor<YoutubeService> {
private final TimeAgoParser timeAgoParser = getService().getTimeAgoParser();
private Document doc;
@ -91,7 +92,7 @@ public class YoutubeTrendingExtractor extends KioskExtractor<YoutubeService> {
for(Element ul : uls) {
for(final Element li : ul.children()) {
final Element el = li.select("div[class*=\"yt-lockup-dismissable\"]").first();
collector.commit(new YoutubeStreamInfoItemExtractor(li, null) {
collector.commit(new YoutubeStreamInfoItemExtractor(li, timeAgoParser) {
@Override
public String getUrl() throws ParsingException {
try {

View File

@ -20,9 +20,9 @@ public class TimeAgoParser {
* A set of english phrases that are contained in the time units.
* (e.g. '7 minutes ago' contains 'min')
*/
public static Map<TimeAgoUnit, String> DEFAULT_AGO_PHRASES = new EnumMap<>(TimeAgoUnit.class);
public static Map<TimeAgoUnit, String[]> DEFAULT_AGO_PHRASES = new EnumMap<>(TimeAgoUnit.class);
private final Map<TimeAgoUnit, String> agoPhrases;
private final Map<TimeAgoUnit, String[]> agoPhrases;
private final Calendar consistentNow;
@ -33,73 +33,82 @@ public class TimeAgoParser {
* </p>
* @param agoPhrases A set of phrases how to recognize the time units in a given language.
*/
public TimeAgoParser(Map<TimeAgoUnit, String> agoPhrases) {
public TimeAgoParser(Map<TimeAgoUnit, String[]> agoPhrases) {
// The map is stored by reference, not copied: later changes made by the caller are visible here.
this.agoPhrases = agoPhrases;
// Captured once at construction. Presumably the shared 'now' for every date this instance
// parses (getNow() is not visible here) - TODO confirm.
consistentNow = Calendar.getInstance();
}
/**
* Parses a textual date in the format '2 days ago' into a Calendar representation.
* Sets the time to the beginning of the day/week/month/year if no exact time is available.
* If the text contains no parsable number, the time unit is not examined at all and the
* result for zero seconds ago is returned instead of an exception being thrown.
* @param textualDate The original date as provided by the streaming service
* @return The parsed (approximated) time
* @throws ParsingException if the time unit could not be recognized
*/
public Calendar parse(String textualDate) throws ParsingException {
try {
int timeAgoValue = parseTimeAgoValue(textualDate);
int timeAgoAmount = parseTimeAgoAmount(textualDate);
// The amount is parsed first, so a NumberFormatException skips the unit lookup below.
TimeAgoUnit timeAgoUnit = parseTimeAgoUnit(textualDate);
return getCalendar(timeAgoValue, timeAgoUnit);
return getCalendar(timeAgoAmount, timeAgoUnit);
} catch (NumberFormatException e) {
// If there is no valid number in the textual date, assume it is 'moments ago'.
return getCalendar(0, TimeAgoUnit.SECONDS);
}
}
private int parseTimeAgoValue(String textualDate) throws NumberFormatException {
/**
 * Extracts the numeric amount from a textual date such as '2 days ago'.
 * All ASCII digits of the text are concatenated in order; every other character is dropped.
 * @param textualDate The original date as provided by the streaming service
 * @return The number formed by the digits of the text
 * @throws NumberFormatException if the text has no digits or the number does not fit an int
 */
private int parseTimeAgoAmount(String textualDate) throws NumberFormatException {
    StringBuilder digits = new StringBuilder();
    for (int i = 0; i < textualDate.length(); i++) {
        char current = textualDate.charAt(i);
        if (current >= '0' && current <= '9') {
            digits.append(current);
        }
    }
    return Integer.parseInt(digits.toString());
}
/**
* Finds the time unit of a textual date by looking for the known phrases in it.
* Units are tried in the iteration order of the phrase map's key set; the first unit with a
* phrase contained in the text (compared case-insensitively) wins.
* @param textualDate The original date as provided by the streaming service
* @return The first matching time unit
* @throws ParsingException if none of the phrases is contained in the text
*/
private TimeAgoUnit parseTimeAgoUnit(String textualDate) throws ParsingException {
for (TimeAgoUnit timeAgoUnit : agoPhrases.keySet()) {
if (textualDate.contains(agoPhrases.get(timeAgoUnit))) {
return timeAgoUnit;
// A null phrase array (or a null phrase in it) fails here with a NullPointerException.
for (String agoPhrase : agoPhrases.get(timeAgoUnit)) {
// NOTE(review): textualDate.toLowerCase() is loop-invariant and could be computed once
// before the loops. Both toLowerCase() calls use the default locale.
if (textualDate.toLowerCase().contains(agoPhrase.toLowerCase())){
return timeAgoUnit;
}
}
}
throw new ParsingException("Unable to parse the date: " + textualDate);
}
private Calendar getCalendar(int timeAgoValue, TimeAgoUnit timeAgoUnit) {
// Moves the calendar obtained from getNow() back by the given amount of the given unit.
// For days and larger units the result is additionally snapped to the start of that
// day/week/month/year via resetTimeOfDay.
private Calendar getCalendar(int timeAgoAmount, TimeAgoUnit timeAgoUnit) {
Calendar calendarTime = getNow();
switch (timeAgoUnit) {
case SECONDS:
calendarTime.add(Calendar.SECOND, -timeAgoValue);
calendarTime.add(Calendar.SECOND, -timeAgoAmount);
break;
case MINUTES:
calendarTime.add(Calendar.MINUTE, -timeAgoValue);
calendarTime.add(Calendar.MINUTE, -timeAgoAmount);
break;
case HOURS:
calendarTime.add(Calendar.HOUR_OF_DAY, -timeAgoValue);
calendarTime.add(Calendar.HOUR_OF_DAY, -timeAgoAmount);
break;
case DAYS:
calendarTime.add(Calendar.DAY_OF_MONTH, -timeAgoValue);
calendarTime.add(Calendar.DAY_OF_MONTH, -timeAgoAmount);
resetTimeOfDay(calendarTime);
break;
case WEEKS:
calendarTime.add(Calendar.WEEK_OF_MONTH, -timeAgoValue);
calendarTime.add(Calendar.WEEK_OF_YEAR, -timeAgoAmount);
// Snap to the first day of that week as defined by the calendar's own settings.
calendarTime.set(Calendar.DAY_OF_WEEK, calendarTime.getFirstDayOfWeek());
resetTimeOfDay(calendarTime);
break;
case MONTHS:
calendarTime.add(Calendar.MONTH, -timeAgoValue);
calendarTime.add(Calendar.MONTH, -timeAgoAmount);
calendarTime.set(Calendar.DAY_OF_MONTH, 1);
resetTimeOfDay(calendarTime);
break;
case YEARS:
calendarTime.add(Calendar.YEAR, -timeAgoValue);
calendarTime.add(Calendar.YEAR, -timeAgoAmount);
// NOTE(review): BUG - Calendar.MONTH is zero-based, so 1 selects February, not January.
// To land on the beginning of the year this must be Calendar.JANUARY (0).
calendarTime.set(Calendar.MONTH, 1);
calendarTime.set(Calendar.DAY_OF_MONTH, 1);
resetTimeOfDay(calendarTime);
@ -117,19 +126,20 @@ public class TimeAgoParser {
calendarTime.set(Calendar.HOUR_OF_DAY, 0);
calendarTime.set(Calendar.MINUTE, 0);
calendarTime.set(Calendar.SECOND, 0);
calendarTime.set(Calendar.MILLISECOND, 0);
}
// Fills the default English phrases. Each phrase is a fragment contained in the unit's name
// (e.g. 'min' in '7 minutes ago'), so one entry covers singular and plural forms.
// NOTE(review): DEFAULT_AGO_PHRASES is a public, mutable map and parsers keep a reference to
// it, so changing it later affects every parser that was created with the defaults.
static {
DEFAULT_AGO_PHRASES.put(TimeAgoUnit.SECONDS, "sec");
DEFAULT_AGO_PHRASES.put(TimeAgoUnit.MINUTES, "min");
DEFAULT_AGO_PHRASES.put(TimeAgoUnit.HOURS, "hour");
DEFAULT_AGO_PHRASES.put(TimeAgoUnit.DAYS, "day");
DEFAULT_AGO_PHRASES.put(TimeAgoUnit.WEEKS, "week");
DEFAULT_AGO_PHRASES.put(TimeAgoUnit.MONTHS, "month");
DEFAULT_AGO_PHRASES.put(TimeAgoUnit.YEARS, "year");
DEFAULT_AGO_PHRASES.put(TimeAgoUnit.SECONDS, new String[]{"sec"});
DEFAULT_AGO_PHRASES.put(TimeAgoUnit.MINUTES, new String[]{"min"});
DEFAULT_AGO_PHRASES.put(TimeAgoUnit.HOURS, new String[]{"hour"});
DEFAULT_AGO_PHRASES.put(TimeAgoUnit.DAYS, new String[]{"day"});
DEFAULT_AGO_PHRASES.put(TimeAgoUnit.WEEKS, new String[]{"week"});
DEFAULT_AGO_PHRASES.put(TimeAgoUnit.MONTHS, new String[]{"month"});
DEFAULT_AGO_PHRASES.put(TimeAgoUnit.YEARS, new String[]{"year"});
}
enum TimeAgoUnit {
public enum TimeAgoUnit {
SECONDS,
MINUTES,
HOURS,

View File

@ -9,6 +9,7 @@ import org.schabi.newpipe.extractor.NewPipe;
import org.schabi.newpipe.extractor.channel.ChannelInfoItem;
import org.schabi.newpipe.extractor.search.SearchEngine;
import org.schabi.newpipe.extractor.search.SearchResult;
import org.schabi.newpipe.extractor.stream.TimeAgoParser;
import java.net.MalformedURLException;
import java.net.URL;
@ -45,7 +46,8 @@ public class YoutubeSearchEngineAllTest extends BaseYoutubeSearchTest {
@BeforeClass
public static void setUpClass() throws Exception {
NewPipe.init(Downloader.getInstance());
YoutubeSearchEngine engine = new YoutubeSearchEngine(1);
YoutubeSearchEngine engine = new YoutubeSearchEngine(1,
new TimeAgoParser(TimeAgoParser.DEFAULT_AGO_PHRASES));
result = engine.search("pewdiepie", 0, "de", SearchEngine.Filter.ANY)
.getSearchResult();